ReactOS 0.4.16-dev-41-ge8c7597
xmlregexp.c
Go to the documentation of this file.
1/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/schemas mechanisms now available in
6 * XML related specifications these include:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
22/* #define DEBUG_ERR */
23
24#include <stdio.h>
25#include <string.h>
26#include <limits.h>
27
28#include <libxml/tree.h>
30#include <libxml/xmlregexp.h>
31#include <libxml/xmlautomata.h>
32#include <libxml/xmlunicode.h>
33
34#ifndef SIZE_MAX
35#define SIZE_MAX ((size_t) -1)
36#endif
37
38/* #define DEBUG_REGEXP_GRAPH */
39/* #define DEBUG_REGEXP_EXEC */
40/* #define DEBUG_PUSH */
41/* #define DEBUG_COMPACTION */
42
43#define MAX_PUSH 10000000
44
45#ifdef ERROR
46#undef ERROR
47#endif
48#define ERROR(str) \
49 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
50 xmlRegexpErrCompile(ctxt, str);
51#define NEXT ctxt->cur++
52#define CUR (*(ctxt->cur))
53#define NXT(index) (ctxt->cur[index])
54
55#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
56#define NEXTL(l) ctxt->cur += l;
57#define XML_REG_STRING_SEPARATOR '|'
58/*
59 * Need PREV to check on a '-' within a Character Group. May only be used
60 * when it's guaranteed that cur is not at the beginning of ctxt->string!
61 */
62#define PREV (ctxt->cur[-1])
63
69#define TODO \
70 xmlGenericError(xmlGenericErrorContext, \
71 "Unimplemented block at %s:%d\n", \
72 __FILE__, __LINE__);
73
74/************************************************************************
75 * *
76 * Datatypes and structures *
77 * *
78 ************************************************************************/
79
80/*
81 * Note: the order of the enums below is significant, do not shuffle
82 */
83typedef enum {
84 XML_REGEXP_EPSILON = 1,
85 XML_REGEXP_CHARVAL,
86 XML_REGEXP_RANGES,
87 XML_REGEXP_SUBREG, /* used for () sub regexps */
88 XML_REGEXP_STRING,
89 XML_REGEXP_ANYCHAR, /* . */
90 XML_REGEXP_ANYSPACE, /* \s */
91 XML_REGEXP_NOTSPACE, /* \S */
92 XML_REGEXP_INITNAME, /* \l */
93 XML_REGEXP_NOTINITNAME, /* \L */
94 XML_REGEXP_NAMECHAR, /* \c */
95 XML_REGEXP_NOTNAMECHAR, /* \C */
96 XML_REGEXP_DECIMAL, /* \d */
97 XML_REGEXP_NOTDECIMAL, /* \D */
98 XML_REGEXP_REALCHAR, /* \w */
99 XML_REGEXP_NOTREALCHAR, /* \W */
100 XML_REGEXP_LETTER = 100,
101 XML_REGEXP_LETTER_UPPERCASE,
102 XML_REGEXP_LETTER_LOWERCASE,
103 XML_REGEXP_LETTER_TITLECASE,
104 XML_REGEXP_LETTER_MODIFIER,
105 XML_REGEXP_LETTER_OTHERS,
106 XML_REGEXP_MARK,
107 XML_REGEXP_MARK_NONSPACING,
108 XML_REGEXP_MARK_SPACECOMBINING,
109 XML_REGEXP_MARK_ENCLOSING,
110 XML_REGEXP_NUMBER,
111 XML_REGEXP_NUMBER_DECIMAL,
112 XML_REGEXP_NUMBER_LETTER,
113 XML_REGEXP_NUMBER_OTHERS,
114 XML_REGEXP_PUNCT,
115 XML_REGEXP_PUNCT_CONNECTOR,
116 XML_REGEXP_PUNCT_DASH,
117 XML_REGEXP_PUNCT_OPEN,
118 XML_REGEXP_PUNCT_CLOSE,
119 XML_REGEXP_PUNCT_INITQUOTE,
120 XML_REGEXP_PUNCT_FINQUOTE,
121 XML_REGEXP_PUNCT_OTHERS,
122 XML_REGEXP_SEPAR,
123 XML_REGEXP_SEPAR_SPACE,
124 XML_REGEXP_SEPAR_LINE,
125 XML_REGEXP_SEPAR_PARA,
126 XML_REGEXP_SYMBOL,
127 XML_REGEXP_SYMBOL_MATH,
128 XML_REGEXP_SYMBOL_CURRENCY,
129 XML_REGEXP_SYMBOL_MODIFIER,
130 XML_REGEXP_SYMBOL_OTHERS,
131 XML_REGEXP_OTHER,
132 XML_REGEXP_OTHER_CONTROL,
133 XML_REGEXP_OTHER_FORMAT,
134 XML_REGEXP_OTHER_PRIVATE,
135 XML_REGEXP_OTHER_NA,
136 XML_REGEXP_BLOCK_NAME
137} xmlRegAtomType;
138
139typedef enum {
140 XML_REGEXP_QUANT_EPSILON = 1,
141 XML_REGEXP_QUANT_ONCE,
142 XML_REGEXP_QUANT_OPT,
143 XML_REGEXP_QUANT_MULT,
144 XML_REGEXP_QUANT_PLUS,
145 XML_REGEXP_QUANT_ONCEONLY,
146 XML_REGEXP_QUANT_ALL,
147 XML_REGEXP_QUANT_RANGE
148} xmlRegQuantType;
149
150typedef enum {
151 XML_REGEXP_START_STATE = 1,
152 XML_REGEXP_FINAL_STATE,
153 XML_REGEXP_TRANS_STATE,
154 XML_REGEXP_SINK_STATE,
155 XML_REGEXP_UNREACH_STATE
156} xmlRegStateType;
157
158typedef enum {
159 XML_REGEXP_MARK_NORMAL = 0,
160 XML_REGEXP_MARK_START,
161 XML_REGEXP_MARK_VISITED
162} xmlRegMarkedType;
163
164typedef struct _xmlRegRange xmlRegRange;
165typedef xmlRegRange *xmlRegRangePtr;
166
167struct _xmlRegRange {
168 int neg; /* 0 normal, 1 not, 2 exclude */
169 xmlRegAtomType type;
170 int start;
171 int end;
172 xmlChar *blockName;
173};
174
175typedef struct _xmlRegAtom xmlRegAtom;
176typedef xmlRegAtom *xmlRegAtomPtr;
177
178typedef struct _xmlAutomataState xmlRegState;
179typedef xmlRegState *xmlRegStatePtr;
180
181struct _xmlRegAtom {
182 int no;
183 xmlRegAtomType type;
184 xmlRegQuantType quant;
185 int min;
186 int max;
187
188 void *valuep;
189 void *valuep2;
190 int neg;
191 int codepoint;
192 xmlRegStatePtr start;
193 xmlRegStatePtr start0;
194 xmlRegStatePtr stop;
195 int maxRanges;
196 int nbRanges;
197 xmlRegRangePtr *ranges;
198 void *data;
199};
200
201typedef struct _xmlRegCounter xmlRegCounter;
202typedef xmlRegCounter *xmlRegCounterPtr;
203
204struct _xmlRegCounter {
205 int min;
206 int max;
207};
208
209typedef struct _xmlRegTrans xmlRegTrans;
210typedef xmlRegTrans *xmlRegTransPtr;
211
212struct _xmlRegTrans {
213 xmlRegAtomPtr atom;
214 int to;
215 int counter;
216 int count;
217 int nd;
218};
219
220struct _xmlAutomataState {
221 xmlRegStateType type;
222 xmlRegMarkedType mark;
223 xmlRegMarkedType markd;
224 xmlRegMarkedType reached;
225 int no;
226 int maxTrans;
227 int nbTrans;
228 xmlRegTrans *trans;
229 /* knowing states pointing to us can speed things up */
230 int maxTransTo;
231 int nbTransTo;
232 int *transTo;
233};
234
235typedef struct _xmlAutomata xmlRegParserCtxt;
236typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
237
238#define AM_AUTOMATA_RNG 1
239
240struct _xmlAutomata {
242 xmlChar *cur;
243
244 int error;
245 int neg;
246
247 xmlRegStatePtr start;
248 xmlRegStatePtr end;
249 xmlRegStatePtr state;
250
251 xmlRegAtomPtr atom;
252
253 int maxAtoms;
254 int nbAtoms;
255 xmlRegAtomPtr *atoms;
256
257 int maxStates;
258 int nbStates;
259 xmlRegStatePtr *states;
260
261 int maxCounters;
262 int nbCounters;
263 xmlRegCounter *counters;
264
265 int determinist;
266 int negs;
267 int flags;
268
269 int depth;
270};
271
272struct _xmlRegexp {
274 int nbStates;
275 xmlRegStatePtr *states;
276 int nbAtoms;
277 xmlRegAtomPtr *atoms;
278 int nbCounters;
279 xmlRegCounter *counters;
280 int determinist;
281 int flags;
282 /*
283 * That's the compact form for determinists automatas
284 */
285 int nbstates;
286 int *compact;
287 void **transdata;
288 int nbstrings;
289 xmlChar **stringMap;
290};
291
292typedef struct _xmlRegExecRollback xmlRegExecRollback;
293typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
294
295struct _xmlRegExecRollback {
296 xmlRegStatePtr state;/* the current state */
297 int index; /* the index in the input stack */
298 int nextbranch; /* the next transition to explore in that state */
299 int *counts; /* save the automata state if it has some */
300};
301
302typedef struct _xmlRegInputToken xmlRegInputToken;
303typedef xmlRegInputToken *xmlRegInputTokenPtr;
304
305struct _xmlRegInputToken {
306 xmlChar *value;
307 void *data;
308};
309
310struct _xmlRegExecCtxt {
311 int status; /* execution status != 0 indicate an error */
312 int determinist; /* did we find an indeterministic behaviour */
313 xmlRegexpPtr comp; /* the compiled regexp */
314 xmlRegExecCallbacks callback;
315 void *data;
316
317 xmlRegStatePtr state;/* the current state */
318 int transno; /* the current transition on that state */
319 int transcount; /* the number of chars in char counted transitions */
320
321 /*
322 * A stack of rollback states
323 */
324 int maxRollbacks;
325 int nbRollbacks;
326 xmlRegExecRollback *rollbacks;
327
328 /*
329 * The state of the automata if any
330 */
331 int *counts;
332
333 /*
334 * The input stack
335 */
336 int inputStackMax;
337 int inputStackNr;
338 int index;
339 int *charStack;
340 const xmlChar *inputString; /* when operating on characters */
341 xmlRegInputTokenPtr inputStack;/* when operating on strings */
342
343 /*
344 * error handling
345 */
346 int errStateNo; /* the error state number */
347 xmlRegStatePtr errState; /* the error state */
348 xmlChar *errString; /* the string raising the error */
349 int *errCounts; /* counters at the error state */
350 int nbPush;
351};
352
353#define REGEXP_ALL_COUNTER 0x123456
354#define REGEXP_ALL_LAX_COUNTER 0x123457
355
356static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
357static void xmlRegFreeState(xmlRegStatePtr state);
358static void xmlRegFreeAtom(xmlRegAtomPtr atom);
359static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
360static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
361static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
362 int neg, int start, int end, const xmlChar *blockName);
363
364void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
365
366/************************************************************************
367 * *
368 * Regexp memory error handler *
369 * *
370 ************************************************************************/
377static void
378xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
379{
380 const char *regexp = NULL;
381 if (ctxt != NULL) {
382 regexp = (const char *) ctxt->string;
383 ctxt->error = XML_ERR_NO_MEMORY;
384 }
385 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
387 regexp, NULL, 0, 0,
388 "Memory allocation failed : %s\n", extra);
389}
390
397static void
398xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
399{
400 const char *regexp = NULL;
401 int idx = 0;
402
403 if (ctxt != NULL) {
404 regexp = (const char *) ctxt->string;
405 idx = ctxt->cur - ctxt->string;
406 ctxt->error = XML_REGEXP_COMPILE_ERROR;
407 }
408 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
410 regexp, NULL, idx, 0,
411 "failed to compile: %s\n", extra);
412}
413
414/************************************************************************
415 * *
416 * Allocation/Deallocation *
417 * *
418 ************************************************************************/
419
420static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
421
432static void*
433xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
434 size_t totalSize;
435 void *ret;
436
437 /* Check for overflow */
438 if (dim1 > SIZE_MAX / dim2 / elemSize)
439 return (NULL);
440 totalSize = dim1 * dim2 * elemSize;
441 ret = xmlMalloc(totalSize);
442 if (ret != NULL)
443 memset(ret, 0, totalSize);
444 return (ret);
445}
446
455static xmlRegexpPtr
456xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
457 xmlRegexpPtr ret;
458
459 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
460 if (ret == NULL) {
461 xmlRegexpErrMemory(ctxt, "compiling regexp");
462 return(NULL);
463 }
464 memset(ret, 0, sizeof(xmlRegexp));
465 ret->string = ctxt->string;
466 ret->nbStates = ctxt->nbStates;
467 ret->states = ctxt->states;
468 ret->nbAtoms = ctxt->nbAtoms;
469 ret->atoms = ctxt->atoms;
470 ret->nbCounters = ctxt->nbCounters;
471 ret->counters = ctxt->counters;
472 ret->determinist = ctxt->determinist;
473 ret->flags = ctxt->flags;
474 if (ret->determinist == -1) {
475 xmlRegexpIsDeterminist(ret);
476 }
477
478 if ((ret->determinist != 0) &&
479 (ret->nbCounters == 0) &&
480 (ctxt->negs == 0) &&
481 (ret->atoms != NULL) &&
482 (ret->atoms[0] != NULL) &&
483 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
484 int i, j, nbstates = 0, nbatoms = 0;
485 int *stateRemap;
486 int *stringRemap;
487 int *transitions;
488 void **transdata;
489 xmlChar **stringMap;
490 xmlChar *value;
491
492 /*
493 * Switch to a compact representation
494 * 1/ counting the effective number of states left
495 * 2/ counting the unique number of atoms, and check that
496 * they are all of the string type
497 * 3/ build a table state x atom for the transitions
498 */
499
500 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
501 if (stateRemap == NULL) {
502 xmlRegexpErrMemory(ctxt, "compiling regexp");
503 xmlFree(ret);
504 return(NULL);
505 }
506 for (i = 0;i < ret->nbStates;i++) {
507 if (ret->states[i] != NULL) {
508 stateRemap[i] = nbstates;
509 nbstates++;
510 } else {
511 stateRemap[i] = -1;
512 }
513 }
514#ifdef DEBUG_COMPACTION
515 printf("Final: %d states\n", nbstates);
516#endif
517 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
518 if (stringMap == NULL) {
519 xmlRegexpErrMemory(ctxt, "compiling regexp");
520 xmlFree(stateRemap);
521 xmlFree(ret);
522 return(NULL);
523 }
524 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
525 if (stringRemap == NULL) {
526 xmlRegexpErrMemory(ctxt, "compiling regexp");
527 xmlFree(stringMap);
528 xmlFree(stateRemap);
529 xmlFree(ret);
530 return(NULL);
531 }
532 for (i = 0;i < ret->nbAtoms;i++) {
533 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
534 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
535 value = ret->atoms[i]->valuep;
536 for (j = 0;j < nbatoms;j++) {
537 if (xmlStrEqual(stringMap[j], value)) {
538 stringRemap[i] = j;
539 break;
540 }
541 }
542 if (j >= nbatoms) {
543 stringRemap[i] = nbatoms;
544 stringMap[nbatoms] = xmlStrdup(value);
545 if (stringMap[nbatoms] == NULL) {
546 for (i = 0;i < nbatoms;i++)
547 xmlFree(stringMap[i]);
548 xmlFree(stringRemap);
549 xmlFree(stringMap);
550 xmlFree(stateRemap);
551 xmlFree(ret);
552 return(NULL);
553 }
554 nbatoms++;
555 }
556 } else {
557 xmlFree(stateRemap);
558 xmlFree(stringRemap);
559 for (i = 0;i < nbatoms;i++)
560 xmlFree(stringMap[i]);
561 xmlFree(stringMap);
562 xmlFree(ret);
563 return(NULL);
564 }
565 }
566#ifdef DEBUG_COMPACTION
567 printf("Final: %d atoms\n", nbatoms);
568#endif
569 transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
570 sizeof(int));
571 if (transitions == NULL) {
572 xmlFree(stateRemap);
573 xmlFree(stringRemap);
574 for (i = 0;i < nbatoms;i++)
575 xmlFree(stringMap[i]);
576 xmlFree(stringMap);
577 xmlFree(ret);
578 return(NULL);
579 }
580
581 /*
582 * Allocate the transition table. The first entry for each
583 * state corresponds to the state type.
584 */
585 transdata = NULL;
586
587 for (i = 0;i < ret->nbStates;i++) {
588 int stateno, atomno, targetno, prev;
589 xmlRegStatePtr state;
590 xmlRegTransPtr trans;
591
592 stateno = stateRemap[i];
593 if (stateno == -1)
594 continue;
595 state = ret->states[i];
596
597 transitions[stateno * (nbatoms + 1)] = state->type;
598
599 for (j = 0;j < state->nbTrans;j++) {
600 trans = &(state->trans[j]);
601 if ((trans->to == -1) || (trans->atom == NULL))
602 continue;
603 atomno = stringRemap[trans->atom->no];
604 if ((trans->atom->data != NULL) && (transdata == NULL)) {
605 transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
606 sizeof(void *));
607 if (transdata == NULL) {
608 xmlRegexpErrMemory(ctxt, "compiling regexp");
609 break;
610 }
611 }
612 targetno = stateRemap[trans->to];
613 /*
614 * if the same atom can generate transitions to 2 different
615 * states then it means the automata is not deterministic and
616 * the compact form can't be used !
617 */
618 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
619 if (prev != 0) {
620 if (prev != targetno + 1) {
621 ret->determinist = 0;
622#ifdef DEBUG_COMPACTION
623 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
624 i, j, trans->atom->no, trans->to, atomno, targetno);
625 printf(" previous to is %d\n", prev);
626#endif
627 if (transdata != NULL)
628 xmlFree(transdata);
629 xmlFree(transitions);
630 xmlFree(stateRemap);
631 xmlFree(stringRemap);
632 for (i = 0;i < nbatoms;i++)
633 xmlFree(stringMap[i]);
634 xmlFree(stringMap);
635 goto not_determ;
636 }
637 } else {
638#if 0
639 printf("State %d trans %d: atom %d to %d : %d to %d\n",
640 i, j, trans->atom->no, trans->to, atomno, targetno);
641#endif
642 transitions[stateno * (nbatoms + 1) + atomno + 1] =
643 targetno + 1; /* to avoid 0 */
644 if (transdata != NULL)
645 transdata[stateno * nbatoms + atomno] =
646 trans->atom->data;
647 }
648 }
649 }
650 ret->determinist = 1;
651#ifdef DEBUG_COMPACTION
652 /*
653 * Debug
654 */
655 for (i = 0;i < nbstates;i++) {
656 for (j = 0;j < nbatoms + 1;j++) {
657 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
658 }
659 printf("\n");
660 }
661 printf("\n");
662#endif
663 /*
664 * Cleanup of the old data
665 */
666 if (ret->states != NULL) {
667 for (i = 0;i < ret->nbStates;i++)
668 xmlRegFreeState(ret->states[i]);
669 xmlFree(ret->states);
670 }
671 ret->states = NULL;
672 ret->nbStates = 0;
673 if (ret->atoms != NULL) {
674 for (i = 0;i < ret->nbAtoms;i++)
675 xmlRegFreeAtom(ret->atoms[i]);
676 xmlFree(ret->atoms);
677 }
678 ret->atoms = NULL;
679 ret->nbAtoms = 0;
680
681 ret->compact = transitions;
682 ret->transdata = transdata;
683 ret->stringMap = stringMap;
684 ret->nbstrings = nbatoms;
685 ret->nbstates = nbstates;
686 xmlFree(stateRemap);
687 xmlFree(stringRemap);
688 }
689not_determ:
690 ctxt->string = NULL;
691 ctxt->nbStates = 0;
692 ctxt->states = NULL;
693 ctxt->nbAtoms = 0;
694 ctxt->atoms = NULL;
695 ctxt->nbCounters = 0;
696 ctxt->counters = NULL;
697 return(ret);
698}
699
708static xmlRegParserCtxtPtr
709xmlRegNewParserCtxt(const xmlChar *string) {
710 xmlRegParserCtxtPtr ret;
711
712 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
713 if (ret == NULL)
714 return(NULL);
715 memset(ret, 0, sizeof(xmlRegParserCtxt));
716 if (string != NULL)
717 ret->string = xmlStrdup(string);
718 ret->cur = ret->string;
719 ret->neg = 0;
720 ret->negs = 0;
721 ret->error = 0;
722 ret->determinist = -1;
723 return(ret);
724}
725
738static xmlRegRangePtr
739xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
740 int neg, xmlRegAtomType type, int start, int end) {
741 xmlRegRangePtr ret;
742
743 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
744 if (ret == NULL) {
745 xmlRegexpErrMemory(ctxt, "allocating range");
746 return(NULL);
747 }
748 ret->neg = neg;
749 ret->type = type;
750 ret->start = start;
751 ret->end = end;
752 return(ret);
753}
754
761static void
762xmlRegFreeRange(xmlRegRangePtr range) {
763 if (range == NULL)
764 return;
765
766 if (range->blockName != NULL)
767 xmlFree(range->blockName);
768 xmlFree(range);
769}
770
779static xmlRegRangePtr
780xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
781 xmlRegRangePtr ret;
782
783 if (range == NULL)
784 return(NULL);
785
786 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
787 range->end);
788 if (ret == NULL)
789 return(NULL);
790 if (range->blockName != NULL) {
791 ret->blockName = xmlStrdup(range->blockName);
792 if (ret->blockName == NULL) {
793 xmlRegexpErrMemory(ctxt, "allocating range");
794 xmlRegFreeRange(ret);
795 return(NULL);
796 }
797 }
798 return(ret);
799}
800
810static xmlRegAtomPtr
811xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
812 xmlRegAtomPtr ret;
813
814 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
815 if (ret == NULL) {
816 xmlRegexpErrMemory(ctxt, "allocating atom");
817 return(NULL);
818 }
819 memset(ret, 0, sizeof(xmlRegAtom));
820 ret->type = type;
821 ret->quant = XML_REGEXP_QUANT_ONCE;
822 ret->min = 0;
823 ret->max = 0;
824 return(ret);
825}
826
833static void
834xmlRegFreeAtom(xmlRegAtomPtr atom) {
835 int i;
836
837 if (atom == NULL)
838 return;
839
840 for (i = 0;i < atom->nbRanges;i++)
841 xmlRegFreeRange(atom->ranges[i]);
842 if (atom->ranges != NULL)
843 xmlFree(atom->ranges);
844 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
845 xmlFree(atom->valuep);
846 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
847 xmlFree(atom->valuep2);
848 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
849 xmlFree(atom->valuep);
850 xmlFree(atom);
851}
852
862static xmlRegAtomPtr
863xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
864 xmlRegAtomPtr ret;
865
866 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
867 if (ret == NULL) {
868 xmlRegexpErrMemory(ctxt, "copying atom");
869 return(NULL);
870 }
871 memset(ret, 0, sizeof(xmlRegAtom));
872 ret->type = atom->type;
873 ret->quant = atom->quant;
874 ret->min = atom->min;
875 ret->max = atom->max;
876 if (atom->nbRanges > 0) {
877 int i;
878
879 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
880 atom->nbRanges);
881 if (ret->ranges == NULL) {
882 xmlRegexpErrMemory(ctxt, "copying atom");
883 goto error;
884 }
885 for (i = 0;i < atom->nbRanges;i++) {
886 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
887 if (ret->ranges[i] == NULL)
888 goto error;
889 ret->nbRanges = i + 1;
890 }
891 }
892 return(ret);
893
894error:
895 xmlRegFreeAtom(ret);
896 return(NULL);
897}
898
899static xmlRegStatePtr
900xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
901 xmlRegStatePtr ret;
902
903 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
904 if (ret == NULL) {
905 xmlRegexpErrMemory(ctxt, "allocating state");
906 return(NULL);
907 }
908 memset(ret, 0, sizeof(xmlRegState));
909 ret->type = XML_REGEXP_TRANS_STATE;
910 ret->mark = XML_REGEXP_MARK_NORMAL;
911 return(ret);
912}
913
920static void
921xmlRegFreeState(xmlRegStatePtr state) {
922 if (state == NULL)
923 return;
924
925 if (state->trans != NULL)
926 xmlFree(state->trans);
927 if (state->transTo != NULL)
928 xmlFree(state->transTo);
929 xmlFree(state);
930}
931
938static void
939xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
940 int i;
941 if (ctxt == NULL)
942 return;
943
944 if (ctxt->string != NULL)
945 xmlFree(ctxt->string);
946 if (ctxt->states != NULL) {
947 for (i = 0;i < ctxt->nbStates;i++)
948 xmlRegFreeState(ctxt->states[i]);
949 xmlFree(ctxt->states);
950 }
951 if (ctxt->atoms != NULL) {
952 for (i = 0;i < ctxt->nbAtoms;i++)
953 xmlRegFreeAtom(ctxt->atoms[i]);
954 xmlFree(ctxt->atoms);
955 }
956 if (ctxt->counters != NULL)
957 xmlFree(ctxt->counters);
958 xmlFree(ctxt);
959}
960
961/************************************************************************
962 * *
963 * Display of Data structures *
964 * *
965 ************************************************************************/
966
967static void
968xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
969 switch (type) {
970 case XML_REGEXP_EPSILON:
971 fprintf(output, "epsilon "); break;
972 case XML_REGEXP_CHARVAL:
973 fprintf(output, "charval "); break;
974 case XML_REGEXP_RANGES:
975 fprintf(output, "ranges "); break;
976 case XML_REGEXP_SUBREG:
977 fprintf(output, "subexpr "); break;
978 case XML_REGEXP_STRING:
979 fprintf(output, "string "); break;
980 case XML_REGEXP_ANYCHAR:
981 fprintf(output, "anychar "); break;
982 case XML_REGEXP_ANYSPACE:
983 fprintf(output, "anyspace "); break;
984 case XML_REGEXP_NOTSPACE:
985 fprintf(output, "notspace "); break;
986 case XML_REGEXP_INITNAME:
987 fprintf(output, "initname "); break;
988 case XML_REGEXP_NOTINITNAME:
989 fprintf(output, "notinitname "); break;
990 case XML_REGEXP_NAMECHAR:
991 fprintf(output, "namechar "); break;
992 case XML_REGEXP_NOTNAMECHAR:
993 fprintf(output, "notnamechar "); break;
994 case XML_REGEXP_DECIMAL:
995 fprintf(output, "decimal "); break;
996 case XML_REGEXP_NOTDECIMAL:
997 fprintf(output, "notdecimal "); break;
998 case XML_REGEXP_REALCHAR:
999 fprintf(output, "realchar "); break;
1000 case XML_REGEXP_NOTREALCHAR:
1001 fprintf(output, "notrealchar "); break;
1002 case XML_REGEXP_LETTER:
1003 fprintf(output, "LETTER "); break;
1004 case XML_REGEXP_LETTER_UPPERCASE:
1005 fprintf(output, "LETTER_UPPERCASE "); break;
1006 case XML_REGEXP_LETTER_LOWERCASE:
1007 fprintf(output, "LETTER_LOWERCASE "); break;
1008 case XML_REGEXP_LETTER_TITLECASE:
1009 fprintf(output, "LETTER_TITLECASE "); break;
1010 case XML_REGEXP_LETTER_MODIFIER:
1011 fprintf(output, "LETTER_MODIFIER "); break;
1012 case XML_REGEXP_LETTER_OTHERS:
1013 fprintf(output, "LETTER_OTHERS "); break;
1014 case XML_REGEXP_MARK:
1015 fprintf(output, "MARK "); break;
1016 case XML_REGEXP_MARK_NONSPACING:
1017 fprintf(output, "MARK_NONSPACING "); break;
1018 case XML_REGEXP_MARK_SPACECOMBINING:
1019 fprintf(output, "MARK_SPACECOMBINING "); break;
1020 case XML_REGEXP_MARK_ENCLOSING:
1021 fprintf(output, "MARK_ENCLOSING "); break;
1022 case XML_REGEXP_NUMBER:
1023 fprintf(output, "NUMBER "); break;
1024 case XML_REGEXP_NUMBER_DECIMAL:
1025 fprintf(output, "NUMBER_DECIMAL "); break;
1026 case XML_REGEXP_NUMBER_LETTER:
1027 fprintf(output, "NUMBER_LETTER "); break;
1028 case XML_REGEXP_NUMBER_OTHERS:
1029 fprintf(output, "NUMBER_OTHERS "); break;
1030 case XML_REGEXP_PUNCT:
1031 fprintf(output, "PUNCT "); break;
1032 case XML_REGEXP_PUNCT_CONNECTOR:
1033 fprintf(output, "PUNCT_CONNECTOR "); break;
1034 case XML_REGEXP_PUNCT_DASH:
1035 fprintf(output, "PUNCT_DASH "); break;
1036 case XML_REGEXP_PUNCT_OPEN:
1037 fprintf(output, "PUNCT_OPEN "); break;
1038 case XML_REGEXP_PUNCT_CLOSE:
1039 fprintf(output, "PUNCT_CLOSE "); break;
1040 case XML_REGEXP_PUNCT_INITQUOTE:
1041 fprintf(output, "PUNCT_INITQUOTE "); break;
1042 case XML_REGEXP_PUNCT_FINQUOTE:
1043 fprintf(output, "PUNCT_FINQUOTE "); break;
1044 case XML_REGEXP_PUNCT_OTHERS:
1045 fprintf(output, "PUNCT_OTHERS "); break;
1046 case XML_REGEXP_SEPAR:
1047 fprintf(output, "SEPAR "); break;
1048 case XML_REGEXP_SEPAR_SPACE:
1049 fprintf(output, "SEPAR_SPACE "); break;
1050 case XML_REGEXP_SEPAR_LINE:
1051 fprintf(output, "SEPAR_LINE "); break;
1052 case XML_REGEXP_SEPAR_PARA:
1053 fprintf(output, "SEPAR_PARA "); break;
1054 case XML_REGEXP_SYMBOL:
1055 fprintf(output, "SYMBOL "); break;
1056 case XML_REGEXP_SYMBOL_MATH:
1057 fprintf(output, "SYMBOL_MATH "); break;
1058 case XML_REGEXP_SYMBOL_CURRENCY:
1059 fprintf(output, "SYMBOL_CURRENCY "); break;
1060 case XML_REGEXP_SYMBOL_MODIFIER:
1061 fprintf(output, "SYMBOL_MODIFIER "); break;
1062 case XML_REGEXP_SYMBOL_OTHERS:
1063 fprintf(output, "SYMBOL_OTHERS "); break;
1064 case XML_REGEXP_OTHER:
1065 fprintf(output, "OTHER "); break;
1066 case XML_REGEXP_OTHER_CONTROL:
1067 fprintf(output, "OTHER_CONTROL "); break;
1068 case XML_REGEXP_OTHER_FORMAT:
1069 fprintf(output, "OTHER_FORMAT "); break;
1070 case XML_REGEXP_OTHER_PRIVATE:
1071 fprintf(output, "OTHER_PRIVATE "); break;
1072 case XML_REGEXP_OTHER_NA:
1073 fprintf(output, "OTHER_NA "); break;
1074 case XML_REGEXP_BLOCK_NAME:
1075 fprintf(output, "BLOCK "); break;
1076 }
1077}
1078
1079static void
1080xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1081 switch (type) {
1082 case XML_REGEXP_QUANT_EPSILON:
1083 fprintf(output, "epsilon "); break;
1084 case XML_REGEXP_QUANT_ONCE:
1085 fprintf(output, "once "); break;
1086 case XML_REGEXP_QUANT_OPT:
1087 fprintf(output, "? "); break;
1088 case XML_REGEXP_QUANT_MULT:
1089 fprintf(output, "* "); break;
1090 case XML_REGEXP_QUANT_PLUS:
1091 fprintf(output, "+ "); break;
1092 case XML_REGEXP_QUANT_RANGE:
1093 fprintf(output, "range "); break;
1094 case XML_REGEXP_QUANT_ONCEONLY:
1095 fprintf(output, "onceonly "); break;
1096 case XML_REGEXP_QUANT_ALL:
1097 fprintf(output, "all "); break;
1098 }
1099}
1100static void
1101xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1102 fprintf(output, " range: ");
1103 if (range->neg)
1104 fprintf(output, "negative ");
1105 xmlRegPrintAtomType(output, range->type);
1106 fprintf(output, "%c - %c\n", range->start, range->end);
1107}
1108
1109static void
1110xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1111 fprintf(output, " atom: ");
1112 if (atom == NULL) {
1113 fprintf(output, "NULL\n");
1114 return;
1115 }
1116 if (atom->neg)
1117 fprintf(output, "not ");
1118 xmlRegPrintAtomType(output, atom->type);
1119 xmlRegPrintQuantType(output, atom->quant);
1120 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1121 fprintf(output, "%d-%d ", atom->min, atom->max);
1122 if (atom->type == XML_REGEXP_STRING)
1123 fprintf(output, "'%s' ", (char *) atom->valuep);
1124 if (atom->type == XML_REGEXP_CHARVAL)
1125 fprintf(output, "char %c\n", atom->codepoint);
1126 else if (atom->type == XML_REGEXP_RANGES) {
1127 int i;
1128 fprintf(output, "%d entries\n", atom->nbRanges);
1129 for (i = 0; i < atom->nbRanges;i++)
1130 xmlRegPrintRange(output, atom->ranges[i]);
1131 } else if (atom->type == XML_REGEXP_SUBREG) {
1132 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1133 } else {
1134 fprintf(output, "\n");
1135 }
1136}
1137
1138static void
1139xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1140 fprintf(output, " trans: ");
1141 if (trans == NULL) {
1142 fprintf(output, "NULL\n");
1143 return;
1144 }
1145 if (trans->to < 0) {
1146 fprintf(output, "removed\n");
1147 return;
1148 }
1149 if (trans->nd != 0) {
1150 if (trans->nd == 2)
1151 fprintf(output, "last not determinist, ");
1152 else
1153 fprintf(output, "not determinist, ");
1154 }
1155 if (trans->counter >= 0) {
1156 fprintf(output, "counted %d, ", trans->counter);
1157 }
1158 if (trans->count == REGEXP_ALL_COUNTER) {
1159 fprintf(output, "all transition, ");
1160 } else if (trans->count >= 0) {
1161 fprintf(output, "count based %d, ", trans->count);
1162 }
1163 if (trans->atom == NULL) {
1164 fprintf(output, "epsilon to %d\n", trans->to);
1165 return;
1166 }
1167 if (trans->atom->type == XML_REGEXP_CHARVAL)
1168 fprintf(output, "char %c ", trans->atom->codepoint);
1169 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1170}
1171
1172static void
1173xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1174 int i;
1175
1176 fprintf(output, " state: ");
1177 if (state == NULL) {
1178 fprintf(output, "NULL\n");
1179 return;
1180 }
1181 if (state->type == XML_REGEXP_START_STATE)
1182 fprintf(output, "START ");
1183 if (state->type == XML_REGEXP_FINAL_STATE)
1184 fprintf(output, "FINAL ");
1185
1186 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1187 for (i = 0;i < state->nbTrans; i++) {
1188 xmlRegPrintTrans(output, &(state->trans[i]));
1189 }
1190}
1191
1192#ifdef DEBUG_REGEXP_GRAPH
1193static void
1194xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1195 int i;
1196
1197 fprintf(output, " ctxt: ");
1198 if (ctxt == NULL) {
1199 fprintf(output, "NULL\n");
1200 return;
1201 }
1202 fprintf(output, "'%s' ", ctxt->string);
1203 if (ctxt->error)
1204 fprintf(output, "error ");
1205 if (ctxt->neg)
1206 fprintf(output, "neg ");
1207 fprintf(output, "\n");
1208 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1209 for (i = 0;i < ctxt->nbAtoms; i++) {
1210 fprintf(output, " %02d ", i);
1211 xmlRegPrintAtom(output, ctxt->atoms[i]);
1212 }
1213 if (ctxt->atom != NULL) {
1214 fprintf(output, "current atom:\n");
1215 xmlRegPrintAtom(output, ctxt->atom);
1216 }
1217 fprintf(output, "%d states:", ctxt->nbStates);
1218 if (ctxt->start != NULL)
1219 fprintf(output, " start: %d", ctxt->start->no);
1220 if (ctxt->end != NULL)
1221 fprintf(output, " end: %d", ctxt->end->no);
1222 fprintf(output, "\n");
1223 for (i = 0;i < ctxt->nbStates; i++) {
1224 xmlRegPrintState(output, ctxt->states[i]);
1225 }
1226 fprintf(output, "%d counters:\n", ctxt->nbCounters);
1227 for (i = 0;i < ctxt->nbCounters; i++) {
1228 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1229 ctxt->counters[i].max);
1230 }
1231}
1232#endif
1233
1234/************************************************************************
1235 * *
1236 * Finite Automata structures manipulations *
1237 * *
1238 ************************************************************************/
1239
1240static void
1241xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1242 int neg, xmlRegAtomType type, int start, int end,
1243 xmlChar *blockName) {
1244 xmlRegRangePtr range;
1245
1246 if (atom == NULL) {
1247 ERROR("add range: atom is NULL");
1248 return;
1249 }
1250 if (atom->type != XML_REGEXP_RANGES) {
1251 ERROR("add range: atom is not ranges");
1252 return;
1253 }
1254 if (atom->maxRanges == 0) {
1255 atom->maxRanges = 4;
1256 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1257 sizeof(xmlRegRangePtr));
1258 if (atom->ranges == NULL) {
1259 xmlRegexpErrMemory(ctxt, "adding ranges");
1260 atom->maxRanges = 0;
1261 return;
1262 }
1263 } else if (atom->nbRanges >= atom->maxRanges) {
1264 xmlRegRangePtr *tmp;
1265 atom->maxRanges *= 2;
1266 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1267 sizeof(xmlRegRangePtr));
1268 if (tmp == NULL) {
1269 xmlRegexpErrMemory(ctxt, "adding ranges");
1270 atom->maxRanges /= 2;
1271 return;
1272 }
1273 atom->ranges = tmp;
1274 }
1275 range = xmlRegNewRange(ctxt, neg, type, start, end);
1276 if (range == NULL)
1277 return;
1278 range->blockName = blockName;
1279 atom->ranges[atom->nbRanges++] = range;
1280
1281}
1282
1283static int
1284xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1285 if (ctxt->maxCounters == 0) {
1286 ctxt->maxCounters = 4;
1287 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1288 sizeof(xmlRegCounter));
1289 if (ctxt->counters == NULL) {
1290 xmlRegexpErrMemory(ctxt, "allocating counter");
1291 ctxt->maxCounters = 0;
1292 return(-1);
1293 }
1294 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1295 xmlRegCounter *tmp;
1296 ctxt->maxCounters *= 2;
1297 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1298 sizeof(xmlRegCounter));
1299 if (tmp == NULL) {
1300 xmlRegexpErrMemory(ctxt, "allocating counter");
1301 ctxt->maxCounters /= 2;
1302 return(-1);
1303 }
1304 ctxt->counters = tmp;
1305 }
1306 ctxt->counters[ctxt->nbCounters].min = -1;
1307 ctxt->counters[ctxt->nbCounters].max = -1;
1308 return(ctxt->nbCounters++);
1309}
1310
1311static int
1312xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1313 if (atom == NULL) {
1314 ERROR("atom push: atom is NULL");
1315 return(-1);
1316 }
1317 if (ctxt->maxAtoms == 0) {
1318 ctxt->maxAtoms = 4;
1319 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1320 sizeof(xmlRegAtomPtr));
1321 if (ctxt->atoms == NULL) {
1322 xmlRegexpErrMemory(ctxt, "pushing atom");
1323 ctxt->maxAtoms = 0;
1324 return(-1);
1325 }
1326 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1327 xmlRegAtomPtr *tmp;
1328 ctxt->maxAtoms *= 2;
1329 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1330 sizeof(xmlRegAtomPtr));
1331 if (tmp == NULL) {
1332 xmlRegexpErrMemory(ctxt, "allocating counter");
1333 ctxt->maxAtoms /= 2;
1334 return(-1);
1335 }
1336 ctxt->atoms = tmp;
1337 }
1338 atom->no = ctxt->nbAtoms;
1339 ctxt->atoms[ctxt->nbAtoms++] = atom;
1340 return(0);
1341}
1342
1343static void
1344xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1345 int from) {
1346 if (target->maxTransTo == 0) {
1347 target->maxTransTo = 8;
1348 target->transTo = (int *) xmlMalloc(target->maxTransTo *
1349 sizeof(int));
1350 if (target->transTo == NULL) {
1351 xmlRegexpErrMemory(ctxt, "adding transition");
1352 target->maxTransTo = 0;
1353 return;
1354 }
1355 } else if (target->nbTransTo >= target->maxTransTo) {
1356 int *tmp;
1357 target->maxTransTo *= 2;
1358 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1359 sizeof(int));
1360 if (tmp == NULL) {
1361 xmlRegexpErrMemory(ctxt, "adding transition");
1362 target->maxTransTo /= 2;
1363 return;
1364 }
1365 target->transTo = tmp;
1366 }
1367 target->transTo[target->nbTransTo] = from;
1368 target->nbTransTo++;
1369}
1370
1371static void
1372xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1373 xmlRegAtomPtr atom, xmlRegStatePtr target,
1374 int counter, int count) {
1375
1376 int nrtrans;
1377
1378 if (state == NULL) {
1379 ERROR("add state: state is NULL");
1380 return;
1381 }
1382 if (target == NULL) {
1383 ERROR("add state: target is NULL");
1384 return;
1385 }
1386 /*
1387 * Other routines follow the philosophy 'When in doubt, add a transition'
1388 * so we check here whether such a transition is already present and, if
1389 * so, silently ignore this request.
1390 */
1391
1392 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1393 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1394 if ((trans->atom == atom) &&
1395 (trans->to == target->no) &&
1396 (trans->counter == counter) &&
1397 (trans->count == count)) {
1398#ifdef DEBUG_REGEXP_GRAPH
1399 printf("Ignoring duplicate transition from %d to %d\n",
1400 state->no, target->no);
1401#endif
1402 return;
1403 }
1404 }
1405
1406 if (state->maxTrans == 0) {
1407 state->maxTrans = 8;
1408 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1409 sizeof(xmlRegTrans));
1410 if (state->trans == NULL) {
1411 xmlRegexpErrMemory(ctxt, "adding transition");
1412 state->maxTrans = 0;
1413 return;
1414 }
1415 } else if (state->nbTrans >= state->maxTrans) {
1416 xmlRegTrans *tmp;
1417 state->maxTrans *= 2;
1418 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1419 sizeof(xmlRegTrans));
1420 if (tmp == NULL) {
1421 xmlRegexpErrMemory(ctxt, "adding transition");
1422 state->maxTrans /= 2;
1423 return;
1424 }
1425 state->trans = tmp;
1426 }
1427#ifdef DEBUG_REGEXP_GRAPH
1428 printf("Add trans from %d to %d ", state->no, target->no);
1429 if (count == REGEXP_ALL_COUNTER)
1430 printf("all transition\n");
1431 else if (count >= 0)
1432 printf("count based %d\n", count);
1433 else if (counter >= 0)
1434 printf("counted %d\n", counter);
1435 else if (atom == NULL)
1436 printf("epsilon transition\n");
1437 else if (atom != NULL)
1438 xmlRegPrintAtom(stdout, atom);
1439#endif
1440
1441 state->trans[state->nbTrans].atom = atom;
1442 state->trans[state->nbTrans].to = target->no;
1443 state->trans[state->nbTrans].counter = counter;
1444 state->trans[state->nbTrans].count = count;
1445 state->trans[state->nbTrans].nd = 0;
1446 state->nbTrans++;
1447 xmlRegStateAddTransTo(ctxt, target, state->no);
1448}
1449
1450static int
1451xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1452 if (state == NULL) return(-1);
1453 if (ctxt->maxStates == 0) {
1454 ctxt->maxStates = 4;
1455 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1456 sizeof(xmlRegStatePtr));
1457 if (ctxt->states == NULL) {
1458 xmlRegexpErrMemory(ctxt, "adding state");
1459 ctxt->maxStates = 0;
1460 return(-1);
1461 }
1462 } else if (ctxt->nbStates >= ctxt->maxStates) {
1463 xmlRegStatePtr *tmp;
1464 ctxt->maxStates *= 2;
1465 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1466 sizeof(xmlRegStatePtr));
1467 if (tmp == NULL) {
1468 xmlRegexpErrMemory(ctxt, "adding state");
1469 ctxt->maxStates /= 2;
1470 return(-1);
1471 }
1472 ctxt->states = tmp;
1473 }
1474 state->no = ctxt->nbStates;
1475 ctxt->states[ctxt->nbStates++] = state;
1476 return(0);
1477}
1478
1487static void
1488xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1489 xmlRegStatePtr from, xmlRegStatePtr to,
1490 int lax) {
1491 if (to == NULL) {
1492 to = xmlRegNewState(ctxt);
1493 xmlRegStatePush(ctxt, to);
1494 ctxt->state = to;
1495 }
1496 if (lax)
1497 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1498 else
1499 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1500}
1501
1509static void
1510xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1511 xmlRegStatePtr from, xmlRegStatePtr to) {
1512 if (to == NULL) {
1513 to = xmlRegNewState(ctxt);
1514 xmlRegStatePush(ctxt, to);
1515 ctxt->state = to;
1516 }
1517 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1518}
1519
1528static void
1529xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1530 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1531 if (to == NULL) {
1532 to = xmlRegNewState(ctxt);
1533 xmlRegStatePush(ctxt, to);
1534 ctxt->state = to;
1535 }
1536 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1537}
1538
1547static void
1548xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1549 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1550 if (to == NULL) {
1551 to = xmlRegNewState(ctxt);
1552 xmlRegStatePush(ctxt, to);
1553 ctxt->state = to;
1554 }
1555 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1556}
1557
1567static int
1568xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1569 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1570 xmlRegStatePtr end;
1571 int nullable = 0;
1572
1573 if (atom == NULL) {
1574 ERROR("generate transition: atom == NULL");
1575 return(-1);
1576 }
1577 if (atom->type == XML_REGEXP_SUBREG) {
1578 /*
1579 * this is a subexpression handling one should not need to
1580 * create a new node except for XML_REGEXP_QUANT_RANGE.
1581 */
1582 if (xmlRegAtomPush(ctxt, atom) < 0) {
1583 return(-1);
1584 }
1585 if ((to != NULL) && (atom->stop != to) &&
1586 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1587 /*
1588 * Generate an epsilon transition to link to the target
1589 */
1590 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1591#ifdef DV
1592 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1593 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1594 to = xmlRegNewState(ctxt);
1595 xmlRegStatePush(ctxt, to);
1596 ctxt->state = to;
1597 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1598#endif
1599 }
1600 switch (atom->quant) {
1601 case XML_REGEXP_QUANT_OPT:
1602 atom->quant = XML_REGEXP_QUANT_ONCE;
1603 /*
1604 * transition done to the state after end of atom.
1605 * 1. set transition from atom start to new state
1606 * 2. set transition from atom end to this state.
1607 */
1608 if (to == NULL) {
1609 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1610 xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1611 ctxt->state);
1612 } else {
1613 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1614 }
1615 break;
1616 case XML_REGEXP_QUANT_MULT:
1617 atom->quant = XML_REGEXP_QUANT_ONCE;
1618 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1619 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1620 break;
1621 case XML_REGEXP_QUANT_PLUS:
1622 atom->quant = XML_REGEXP_QUANT_ONCE;
1623 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1624 break;
1625 case XML_REGEXP_QUANT_RANGE: {
1626 int counter;
1627 xmlRegStatePtr inter, newstate;
1628
1629 /*
1630 * create the final state now if needed
1631 */
1632 if (to != NULL) {
1633 newstate = to;
1634 } else {
1635 newstate = xmlRegNewState(ctxt);
1636 xmlRegStatePush(ctxt, newstate);
1637 }
1638
1639 /*
1640 * The principle here is to use counted transition
1641 * to avoid explosion in the number of states in the
1642 * graph. This is clearly more complex but should not
1643 * be exploitable at runtime.
1644 */
1645 if ((atom->min == 0) && (atom->start0 == NULL)) {
1646 xmlRegAtomPtr copy;
1647 /*
1648 * duplicate a transition based on atom to count next
1649 * occurrences after 1. We cannot loop to atom->start
1650 * directly because we need an epsilon transition to
1651 * newstate.
1652 */
1653 /* ???? For some reason it seems we never reach that
1654 case, I suppose this got optimized out before when
1655 building the automata */
1656 copy = xmlRegCopyAtom(ctxt, atom);
1657 if (copy == NULL)
1658 return(-1);
1659 copy->quant = XML_REGEXP_QUANT_ONCE;
1660 copy->min = 0;
1661 copy->max = 0;
1662
1663 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1664 < 0)
1665 return(-1);
1666 inter = ctxt->state;
1667 counter = xmlRegGetCounter(ctxt);
1668 ctxt->counters[counter].min = atom->min - 1;
1669 ctxt->counters[counter].max = atom->max - 1;
1670 /* count the number of times we see it again */
1671 xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1672 atom->stop, counter);
1673 /* allow a way out based on the count */
1674 xmlFAGenerateCountedTransition(ctxt, inter,
1675 newstate, counter);
1676 /* and also allow a direct exit for 0 */
1677 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1678 newstate);
1679 } else {
1680 /*
1681 * either we need the atom at least once or there
1682 * is an atom->start0 allowing to easily plug the
1683 * epsilon transition.
1684 */
1685 counter = xmlRegGetCounter(ctxt);
1686 ctxt->counters[counter].min = atom->min - 1;
1687 ctxt->counters[counter].max = atom->max - 1;
1688 /* allow a way out based on the count */
1689 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1690 newstate, counter);
1691 /* count the number of times we see it again */
1692 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1693 atom->start, counter);
1694 /* and if needed allow a direct exit for 0 */
1695 if (atom->min == 0)
1696 xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1697 newstate);
1698
1699 }
1700 atom->min = 0;
1701 atom->max = 0;
1702 atom->quant = XML_REGEXP_QUANT_ONCE;
1703 ctxt->state = newstate;
1704 }
1705 default:
1706 break;
1707 }
1708 return(0);
1709 }
1710 if ((atom->min == 0) && (atom->max == 0) &&
1711 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1712 /*
1713 * we can discard the atom and generate an epsilon transition instead
1714 */
1715 if (to == NULL) {
1716 to = xmlRegNewState(ctxt);
1717 if (to != NULL)
1718 xmlRegStatePush(ctxt, to);
1719 else {
1720 return(-1);
1721 }
1722 }
1723 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1724 ctxt->state = to;
1725 xmlRegFreeAtom(atom);
1726 return(0);
1727 }
1728 if (to == NULL) {
1729 to = xmlRegNewState(ctxt);
1730 if (to != NULL)
1731 xmlRegStatePush(ctxt, to);
1732 else {
1733 return(-1);
1734 }
1735 }
1736 end = to;
1737 if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1738 (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1739 /*
1740 * Do not pollute the target state by adding transitions from
1741 * it as it is likely to be the shared target of multiple branches.
1742 * So isolate with an epsilon transition.
1743 */
1744 xmlRegStatePtr tmp;
1745
1746 tmp = xmlRegNewState(ctxt);
1747 if (tmp != NULL)
1748 xmlRegStatePush(ctxt, tmp);
1749 else {
1750 return(-1);
1751 }
1752 xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1753 to = tmp;
1754 }
1755 if (xmlRegAtomPush(ctxt, atom) < 0) {
1756 return(-1);
1757 }
1758 if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1759 (atom->min == 0) && (atom->max > 0)) {
1760 nullable = 1;
1761 atom->min = 1;
1762 if (atom->max == 1)
1763 atom->quant = XML_REGEXP_QUANT_OPT;
1764 }
1765 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1766 ctxt->state = end;
1767 switch (atom->quant) {
1768 case XML_REGEXP_QUANT_OPT:
1769 atom->quant = XML_REGEXP_QUANT_ONCE;
1770 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1771 break;
1772 case XML_REGEXP_QUANT_MULT:
1773 atom->quant = XML_REGEXP_QUANT_ONCE;
1774 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1775 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1776 break;
1777 case XML_REGEXP_QUANT_PLUS:
1778 atom->quant = XML_REGEXP_QUANT_ONCE;
1779 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1780 break;
1781 case XML_REGEXP_QUANT_RANGE:
1782 if (nullable)
1783 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1784 break;
1785 default:
1786 break;
1787 }
1788 return(0);
1789}
1790
1799static void
1800xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1801 int tonr, int counter) {
1802 int transnr;
1803 xmlRegStatePtr from;
1804 xmlRegStatePtr to;
1805
1806#ifdef DEBUG_REGEXP_GRAPH
1807 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1808#endif
1809 from = ctxt->states[fromnr];
1810 if (from == NULL)
1811 return;
1812 to = ctxt->states[tonr];
1813 if (to == NULL)
1814 return;
1815 if ((to->mark == XML_REGEXP_MARK_START) ||
1816 (to->mark == XML_REGEXP_MARK_VISITED))
1817 return;
1818
1819 to->mark = XML_REGEXP_MARK_VISITED;
1820 if (to->type == XML_REGEXP_FINAL_STATE) {
1821#ifdef DEBUG_REGEXP_GRAPH
1822 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1823#endif
1824 from->type = XML_REGEXP_FINAL_STATE;
1825 }
1826 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1827 if (to->trans[transnr].to < 0)
1828 continue;
1829 if (to->trans[transnr].atom == NULL) {
1830 /*
1831 * Don't remove counted transitions
1832 * Don't loop either
1833 */
1834 if (to->trans[transnr].to != fromnr) {
1835 if (to->trans[transnr].count >= 0) {
1836 int newto = to->trans[transnr].to;
1837
1838 xmlRegStateAddTrans(ctxt, from, NULL,
1839 ctxt->states[newto],
1840 -1, to->trans[transnr].count);
1841 } else {
1842#ifdef DEBUG_REGEXP_GRAPH
1843 printf("Found epsilon trans %d from %d to %d\n",
1844 transnr, tonr, to->trans[transnr].to);
1845#endif
1846 if (to->trans[transnr].counter >= 0) {
1847 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1848 to->trans[transnr].to,
1849 to->trans[transnr].counter);
1850 } else {
1851 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1852 to->trans[transnr].to,
1853 counter);
1854 }
1855 }
1856 }
1857 } else {
1858 int newto = to->trans[transnr].to;
1859
1860 if (to->trans[transnr].counter >= 0) {
1861 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1862 ctxt->states[newto],
1863 to->trans[transnr].counter, -1);
1864 } else {
1865 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1866 ctxt->states[newto], counter, -1);
1867 }
1868 }
1869 }
1870 to->mark = XML_REGEXP_MARK_NORMAL;
1871}
1872
1894static void
1895xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1896 int statenr, i, j, newto;
1897 xmlRegStatePtr state, tmp;
1898
1899 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1900 state = ctxt->states[statenr];
1901 if (state == NULL)
1902 continue;
1903 if (state->nbTrans != 1)
1904 continue;
1905 if (state->type == XML_REGEXP_UNREACH_STATE ||
1906 state->type == XML_REGEXP_FINAL_STATE)
1907 continue;
1908 /* is the only transition out a basic transition */
1909 if ((state->trans[0].atom == NULL) &&
1910 (state->trans[0].to >= 0) &&
1911 (state->trans[0].to != statenr) &&
1912 (state->trans[0].counter < 0) &&
1913 (state->trans[0].count < 0)) {
1914 newto = state->trans[0].to;
1915
1916 if (state->type == XML_REGEXP_START_STATE) {
1917#ifdef DEBUG_REGEXP_GRAPH
1918 printf("Found simple epsilon trans from start %d to %d\n",
1919 statenr, newto);
1920#endif
1921 } else {
1922#ifdef DEBUG_REGEXP_GRAPH
1923 printf("Found simple epsilon trans from %d to %d\n",
1924 statenr, newto);
1925#endif
1926 for (i = 0;i < state->nbTransTo;i++) {
1927 tmp = ctxt->states[state->transTo[i]];
1928 for (j = 0;j < tmp->nbTrans;j++) {
1929 if (tmp->trans[j].to == statenr) {
1930#ifdef DEBUG_REGEXP_GRAPH
1931 printf("Changed transition %d on %d to go to %d\n",
1932 j, tmp->no, newto);
1933#endif
1934 tmp->trans[j].to = -1;
1935 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1936 ctxt->states[newto],
1937 tmp->trans[j].counter,
1938 tmp->trans[j].count);
1939 }
1940 }
1941 }
1942 if (state->type == XML_REGEXP_FINAL_STATE)
1943 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1944 /* eliminate the transition completely */
1945 state->nbTrans = 0;
1946
1947 state->type = XML_REGEXP_UNREACH_STATE;
1948
1949 }
1950
1951 }
1952 }
1953}
1959static void
1960xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1961 int statenr, transnr;
1962 xmlRegStatePtr state;
1963 int has_epsilon;
1964
1965 if (ctxt->states == NULL) return;
1966
1967 /*
1968 * Eliminate simple epsilon transition and the associated unreachable
1969 * states.
1970 */
1971 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1972 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1973 state = ctxt->states[statenr];
1974 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1975#ifdef DEBUG_REGEXP_GRAPH
1976 printf("Removed unreachable state %d\n", statenr);
1977#endif
1978 xmlRegFreeState(state);
1979 ctxt->states[statenr] = NULL;
1980 }
1981 }
1982
1983 has_epsilon = 0;
1984
1985 /*
1986 * Build the completed transitions bypassing the epsilons
1987 * Use a marking algorithm to avoid loops
1988 * Mark sink states too.
1989 * Process from the latest states backward to the start when
1990 * there is long cascading epsilon chains this minimize the
1991 * recursions and transition compares when adding the new ones
1992 */
1993 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1994 state = ctxt->states[statenr];
1995 if (state == NULL)
1996 continue;
1997 if ((state->nbTrans == 0) &&
1998 (state->type != XML_REGEXP_FINAL_STATE)) {
1999 state->type = XML_REGEXP_SINK_STATE;
2000 }
2001 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2002 if ((state->trans[transnr].atom == NULL) &&
2003 (state->trans[transnr].to >= 0)) {
2004 if (state->trans[transnr].to == statenr) {
2005 state->trans[transnr].to = -1;
2006#ifdef DEBUG_REGEXP_GRAPH
2007 printf("Removed loopback epsilon trans %d on %d\n",
2008 transnr, statenr);
2009#endif
2010 } else if (state->trans[transnr].count < 0) {
2011 int newto = state->trans[transnr].to;
2012
2013#ifdef DEBUG_REGEXP_GRAPH
2014 printf("Found epsilon trans %d from %d to %d\n",
2015 transnr, statenr, newto);
2016#endif
2017 has_epsilon = 1;
2018 state->trans[transnr].to = -2;
2019 state->mark = XML_REGEXP_MARK_START;
2020 xmlFAReduceEpsilonTransitions(ctxt, statenr,
2021 newto, state->trans[transnr].counter);
2022 state->mark = XML_REGEXP_MARK_NORMAL;
2023#ifdef DEBUG_REGEXP_GRAPH
2024 } else {
2025 printf("Found counted transition %d on %d\n",
2026 transnr, statenr);
2027#endif
2028 }
2029 }
2030 }
2031 }
2032 /*
2033 * Eliminate the epsilon transitions
2034 */
2035 if (has_epsilon) {
2036 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2037 state = ctxt->states[statenr];
2038 if (state == NULL)
2039 continue;
2040 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2041 xmlRegTransPtr trans = &(state->trans[transnr]);
2042 if ((trans->atom == NULL) &&
2043 (trans->count < 0) &&
2044 (trans->to >= 0)) {
2045 trans->to = -1;
2046 }
2047 }
2048 }
2049 }
2050
2051 /*
2052 * Use this pass to detect unreachable states too
2053 */
2054 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2055 state = ctxt->states[statenr];
2056 if (state != NULL)
2057 state->reached = XML_REGEXP_MARK_NORMAL;
2058 }
2059 state = ctxt->states[0];
2060 if (state != NULL)
2061 state->reached = XML_REGEXP_MARK_START;
2062 while (state != NULL) {
2063 xmlRegStatePtr target = NULL;
2064 state->reached = XML_REGEXP_MARK_VISITED;
2065 /*
2066 * Mark all states reachable from the current reachable state
2067 */
2068 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2069 if ((state->trans[transnr].to >= 0) &&
2070 ((state->trans[transnr].atom != NULL) ||
2071 (state->trans[transnr].count >= 0))) {
2072 int newto = state->trans[transnr].to;
2073
2074 if (ctxt->states[newto] == NULL)
2075 continue;
2076 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2077 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
2078 target = ctxt->states[newto];
2079 }
2080 }
2081 }
2082
2083 /*
2084 * find the next accessible state not explored
2085 */
2086 if (target == NULL) {
2087 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2088 state = ctxt->states[statenr];
2089 if ((state != NULL) && (state->reached ==
2090 XML_REGEXP_MARK_START)) {
2091 target = state;
2092 break;
2093 }
2094 }
2095 }
2096 state = target;
2097 }
2098 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2099 state = ctxt->states[statenr];
2100 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2101#ifdef DEBUG_REGEXP_GRAPH
2102 printf("Removed unreachable state %d\n", statenr);
2103#endif
2104 xmlRegFreeState(state);
2105 ctxt->states[statenr] = NULL;
2106 }
2107 }
2108
2109}
2110
2111static int
2112xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2113 int ret = 0;
2114
2115 if ((range1->type == XML_REGEXP_RANGES) ||
2116 (range2->type == XML_REGEXP_RANGES) ||
2117 (range2->type == XML_REGEXP_SUBREG) ||
2118 (range1->type == XML_REGEXP_SUBREG) ||
2119 (range1->type == XML_REGEXP_STRING) ||
2120 (range2->type == XML_REGEXP_STRING))
2121 return(-1);
2122
2123 /* put them in order */
2124 if (range1->type > range2->type) {
2125 xmlRegRangePtr tmp;
2126
2127 tmp = range1;
2128 range1 = range2;
2129 range2 = tmp;
2130 }
2131 if ((range1->type == XML_REGEXP_ANYCHAR) ||
2132 (range2->type == XML_REGEXP_ANYCHAR)) {
2133 ret = 1;
2134 } else if ((range1->type == XML_REGEXP_EPSILON) ||
2135 (range2->type == XML_REGEXP_EPSILON)) {
2136 return(0);
2137 } else if (range1->type == range2->type) {
2138 if (range1->type != XML_REGEXP_CHARVAL)
2139 ret = 1;
2140 else if ((range1->end < range2->start) ||
2141 (range2->end < range1->start))
2142 ret = 0;
2143 else
2144 ret = 1;
2145 } else if (range1->type == XML_REGEXP_CHARVAL) {
2146 int codepoint;
2147 int neg = 0;
2148
2149 /*
2150 * just check all codepoints in the range for acceptance,
2151 * this is usually way cheaper since done only once at
2152 * compilation than testing over and over at runtime or
2153 * pushing too many states when evaluating.
2154 */
2155 if (((range1->neg == 0) && (range2->neg != 0)) ||
2156 ((range1->neg != 0) && (range2->neg == 0)))
2157 neg = 1;
2158
2159 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2160 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2161 0, range2->start, range2->end,
2162 range2->blockName);
2163 if (ret < 0)
2164 return(-1);
2165 if (((neg == 1) && (ret == 0)) ||
2166 ((neg == 0) && (ret == 1)))
2167 return(1);
2168 }
2169 return(0);
2170 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2171 (range2->type == XML_REGEXP_BLOCK_NAME)) {
2172 if (range1->type == range2->type) {
2173 ret = xmlStrEqual(range1->blockName, range2->blockName);
2174 } else {
2175 /*
2176 * comparing a block range with anything else is way
2177 * too costly, and maintaining the table is like too much
2178 * memory too, so let's force the automata to save state
2179 * here.
2180 */
2181 return(1);
2182 }
2183 } else if ((range1->type < XML_REGEXP_LETTER) ||
2184 (range2->type < XML_REGEXP_LETTER)) {
2185 if ((range1->type == XML_REGEXP_ANYSPACE) &&
2186 (range2->type == XML_REGEXP_NOTSPACE))
2187 ret = 0;
2188 else if ((range1->type == XML_REGEXP_INITNAME) &&
2189 (range2->type == XML_REGEXP_NOTINITNAME))
2190 ret = 0;
2191 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2192 (range2->type == XML_REGEXP_NOTNAMECHAR))
2193 ret = 0;
2194 else if ((range1->type == XML_REGEXP_DECIMAL) &&
2195 (range2->type == XML_REGEXP_NOTDECIMAL))
2196 ret = 0;
2197 else if ((range1->type == XML_REGEXP_REALCHAR) &&
2198 (range2->type == XML_REGEXP_NOTREALCHAR))
2199 ret = 0;
2200 else {
2201 /* same thing to limit complexity */
2202 return(1);
2203 }
2204 } else {
2205 ret = 0;
2206 /* range1->type < range2->type here */
2207 switch (range1->type) {
2208 case XML_REGEXP_LETTER:
2209 /* all disjoint except in the subgroups */
2210 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2211 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2212 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2213 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2214 (range2->type == XML_REGEXP_LETTER_OTHERS))
2215 ret = 1;
2216 break;
2217 case XML_REGEXP_MARK:
2218 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2219 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2220 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2221 ret = 1;
2222 break;
2223 case XML_REGEXP_NUMBER:
2224 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2225 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2226 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2227 ret = 1;
2228 break;
2229 case XML_REGEXP_PUNCT:
2230 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2231 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2232 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2233 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2234 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2235 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2236 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2237 ret = 1;
2238 break;
2239 case XML_REGEXP_SEPAR:
2240 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2241 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2242 (range2->type == XML_REGEXP_SEPAR_PARA))
2243 ret = 1;
2244 break;
2245 case XML_REGEXP_SYMBOL:
2246 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2247 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2248 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2249 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2250 ret = 1;
2251 break;
2252 case XML_REGEXP_OTHER:
2253 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2254 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2255 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2256 ret = 1;
2257 break;
2258 default:
2259 if ((range2->type >= XML_REGEXP_LETTER) &&
2260 (range2->type < XML_REGEXP_BLOCK_NAME))
2261 ret = 0;
2262 else {
2263 /* safety net ! */
2264 return(1);
2265 }
2266 }
2267 }
2268 if (((range1->neg == 0) && (range2->neg != 0)) ||
2269 ((range1->neg != 0) && (range2->neg == 0)))
2270 ret = !ret;
2271 return(ret);
2272}
2273
2284static int
2285xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2286 if ((type1 == XML_REGEXP_EPSILON) ||
2287 (type1 == XML_REGEXP_CHARVAL) ||
2288 (type1 == XML_REGEXP_RANGES) ||
2289 (type1 == XML_REGEXP_SUBREG) ||
2290 (type1 == XML_REGEXP_STRING) ||
2291 (type1 == XML_REGEXP_ANYCHAR))
2292 return(1);
2293 if ((type2 == XML_REGEXP_EPSILON) ||
2294 (type2 == XML_REGEXP_CHARVAL) ||
2295 (type2 == XML_REGEXP_RANGES) ||
2296 (type2 == XML_REGEXP_SUBREG) ||
2297 (type2 == XML_REGEXP_STRING) ||
2298 (type2 == XML_REGEXP_ANYCHAR))
2299 return(1);
2300
2301 if (type1 == type2) return(1);
2302
2303 /* simplify subsequent compares by making sure type1 < type2 */
2304 if (type1 > type2) {
2305 xmlRegAtomType tmp = type1;
2306 type1 = type2;
2307 type2 = tmp;
2308 }
2309 switch (type1) {
2310 case XML_REGEXP_ANYSPACE: /* \s */
2311 /* can't be a letter, number, mark, punctuation, symbol */
2312 if ((type2 == XML_REGEXP_NOTSPACE) ||
2313 ((type2 >= XML_REGEXP_LETTER) &&
2314 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2315 ((type2 >= XML_REGEXP_NUMBER) &&
2316 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2317 ((type2 >= XML_REGEXP_MARK) &&
2318 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2319 ((type2 >= XML_REGEXP_PUNCT) &&
2320 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2321 ((type2 >= XML_REGEXP_SYMBOL) &&
2322 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2323 ) return(0);
2324 break;
2325 case XML_REGEXP_NOTSPACE: /* \S */
2326 break;
2327 case XML_REGEXP_INITNAME: /* \l */
2328 /* can't be a number, mark, separator, punctuation, symbol or other */
2329 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2330 ((type2 >= XML_REGEXP_NUMBER) &&
2331 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2332 ((type2 >= XML_REGEXP_MARK) &&
2333 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2334 ((type2 >= XML_REGEXP_SEPAR) &&
2335 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2336 ((type2 >= XML_REGEXP_PUNCT) &&
2337 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2338 ((type2 >= XML_REGEXP_SYMBOL) &&
2339 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2340 ((type2 >= XML_REGEXP_OTHER) &&
2341 (type2 <= XML_REGEXP_OTHER_NA))
2342 ) return(0);
2343 break;
2344 case XML_REGEXP_NOTINITNAME: /* \L */
2345 break;
2346 case XML_REGEXP_NAMECHAR: /* \c */
2347 /* can't be a mark, separator, punctuation, symbol or other */
2348 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2349 ((type2 >= XML_REGEXP_MARK) &&
2350 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2351 ((type2 >= XML_REGEXP_PUNCT) &&
2352 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2353 ((type2 >= XML_REGEXP_SEPAR) &&
2354 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2355 ((type2 >= XML_REGEXP_SYMBOL) &&
2356 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2357 ((type2 >= XML_REGEXP_OTHER) &&
2358 (type2 <= XML_REGEXP_OTHER_NA))
2359 ) return(0);
2360 break;
2361 case XML_REGEXP_NOTNAMECHAR: /* \C */
2362 break;
2363 case XML_REGEXP_DECIMAL: /* \d */
2364 /* can't be a letter, mark, separator, punctuation, symbol or other */
2365 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2366 (type2 == XML_REGEXP_REALCHAR) ||
2367 ((type2 >= XML_REGEXP_LETTER) &&
2368 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2369 ((type2 >= XML_REGEXP_MARK) &&
2370 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2371 ((type2 >= XML_REGEXP_PUNCT) &&
2372 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2373 ((type2 >= XML_REGEXP_SEPAR) &&
2374 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2375 ((type2 >= XML_REGEXP_SYMBOL) &&
2376 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2377 ((type2 >= XML_REGEXP_OTHER) &&
2378 (type2 <= XML_REGEXP_OTHER_NA))
2379 )return(0);
2380 break;
2381 case XML_REGEXP_NOTDECIMAL: /* \D */
2382 break;
2383 case XML_REGEXP_REALCHAR: /* \w */
2384 /* can't be a mark, separator, punctuation, symbol or other */
2385 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2386 ((type2 >= XML_REGEXP_MARK) &&
2387 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2388 ((type2 >= XML_REGEXP_PUNCT) &&
2389 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2390 ((type2 >= XML_REGEXP_SEPAR) &&
2391 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2392 ((type2 >= XML_REGEXP_SYMBOL) &&
2393 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2394 ((type2 >= XML_REGEXP_OTHER) &&
2395 (type2 <= XML_REGEXP_OTHER_NA))
2396 )return(0);
2397 break;
2398 case XML_REGEXP_NOTREALCHAR: /* \W */
2399 break;
2400 /*
2401 * at that point we know both type 1 and type2 are from
2402 * character categories are ordered and are different,
2403 * it becomes simple because this is a partition
2404 */
2405 case XML_REGEXP_LETTER:
2406 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2407 return(1);
2408 return(0);
2409 case XML_REGEXP_LETTER_UPPERCASE:
2410 case XML_REGEXP_LETTER_LOWERCASE:
2411 case XML_REGEXP_LETTER_TITLECASE:
2412 case XML_REGEXP_LETTER_MODIFIER:
2413 case XML_REGEXP_LETTER_OTHERS:
2414 return(0);
2415 case XML_REGEXP_MARK:
2416 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2417 return(1);
2418 return(0);
2419 case XML_REGEXP_MARK_NONSPACING:
2420 case XML_REGEXP_MARK_SPACECOMBINING:
2421 case XML_REGEXP_MARK_ENCLOSING:
2422 return(0);
2423 case XML_REGEXP_NUMBER:
2424 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2425 return(1);
2426 return(0);
2427 case XML_REGEXP_NUMBER_DECIMAL:
2428 case XML_REGEXP_NUMBER_LETTER:
2429 case XML_REGEXP_NUMBER_OTHERS:
2430 return(0);
2431 case XML_REGEXP_PUNCT:
2432 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2433 return(1);
2434 return(0);
2435 case XML_REGEXP_PUNCT_CONNECTOR:
2436 case XML_REGEXP_PUNCT_DASH:
2437 case XML_REGEXP_PUNCT_OPEN:
2438 case XML_REGEXP_PUNCT_CLOSE:
2439 case XML_REGEXP_PUNCT_INITQUOTE:
2440 case XML_REGEXP_PUNCT_FINQUOTE:
2441 case XML_REGEXP_PUNCT_OTHERS:
2442 return(0);
2443 case XML_REGEXP_SEPAR:
2444 if (type2 <= XML_REGEXP_SEPAR_PARA)
2445 return(1);
2446 return(0);
2447 case XML_REGEXP_SEPAR_SPACE:
2448 case XML_REGEXP_SEPAR_LINE:
2449 case XML_REGEXP_SEPAR_PARA:
2450 return(0);
2451 case XML_REGEXP_SYMBOL:
2452 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2453 return(1);
2454 return(0);
2455 case XML_REGEXP_SYMBOL_MATH:
2456 case XML_REGEXP_SYMBOL_CURRENCY:
2457 case XML_REGEXP_SYMBOL_MODIFIER:
2458 case XML_REGEXP_SYMBOL_OTHERS:
2459 return(0);
2460 case XML_REGEXP_OTHER:
2461 if (type2 <= XML_REGEXP_OTHER_NA)
2462 return(1);
2463 return(0);
2464 case XML_REGEXP_OTHER_CONTROL:
2465 case XML_REGEXP_OTHER_FORMAT:
2466 case XML_REGEXP_OTHER_PRIVATE:
2467 case XML_REGEXP_OTHER_NA:
2468 return(0);
2469 default:
2470 break;
2471 }
2472 return(1);
2473}
2474
2486static int
2487xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2488 int ret = 0;
2489
2490 if (atom1 == atom2)
2491 return(1);
2492 if ((atom1 == NULL) || (atom2 == NULL))
2493 return(0);
2494
2495 if (atom1->type != atom2->type)
2496 return(0);
2497 switch (atom1->type) {
2498 case XML_REGEXP_EPSILON:
2499 ret = 0;
2500 break;
2501 case XML_REGEXP_STRING:
2502 if (!deep)
2503 ret = (atom1->valuep == atom2->valuep);
2504 else
2505 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2506 (xmlChar *)atom2->valuep);
2507 break;
2508 case XML_REGEXP_CHARVAL:
2509 ret = (atom1->codepoint == atom2->codepoint);
2510 break;
2511 case XML_REGEXP_RANGES:
2512 /* too hard to do in the general case */
2513 ret = 0;
2514 default:
2515 break;
2516 }
2517 return(ret);
2518}
2519
2531static int
2532xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2533 int ret = 1;
2534
2535 if (atom1 == atom2)
2536 return(1);
2537 if ((atom1 == NULL) || (atom2 == NULL))
2538 return(0);
2539
2540 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2541 (atom2->type == XML_REGEXP_ANYCHAR))
2542 return(1);
2543
2544 if (atom1->type > atom2->type) {
2545 xmlRegAtomPtr tmp;
2546 tmp = atom1;
2547 atom1 = atom2;
2548 atom2 = tmp;
2549 }
2550 if (atom1->type != atom2->type) {
2551 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2552 /* if they can't intersect at the type level break now */
2553 if (ret == 0)
2554 return(0);
2555 }
2556 switch (atom1->type) {
2557 case XML_REGEXP_STRING:
2558 if (!deep)
2559 ret = (atom1->valuep != atom2->valuep);
2560 else {
2561 xmlChar *val1 = (xmlChar *)atom1->valuep;
2562 xmlChar *val2 = (xmlChar *)atom2->valuep;
2563 int compound1 = (xmlStrchr(val1, '|') != NULL);
2564 int compound2 = (xmlStrchr(val2, '|') != NULL);
2565
2566 /* Ignore negative match flag for ##other namespaces */
2567 if (compound1 != compound2)
2568 return(0);
2569
2570 ret = xmlRegStrEqualWildcard(val1, val2);
2571 }
2572 break;
2573 case XML_REGEXP_EPSILON:
2574 goto not_determinist;
2575 case XML_REGEXP_CHARVAL:
2576 if (atom2->type == XML_REGEXP_CHARVAL) {
2577 ret = (atom1->codepoint == atom2->codepoint);
2578 } else {
2579 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2580 if (ret < 0)
2581 ret = 1;
2582 }
2583 break;
2584 case XML_REGEXP_RANGES:
2585 if (atom2->type == XML_REGEXP_RANGES) {
2586 int i, j, res;
2587 xmlRegRangePtr r1, r2;
2588
2589 /*
2590 * need to check that none of the ranges eventually matches
2591 */
2592 for (i = 0;i < atom1->nbRanges;i++) {
2593 for (j = 0;j < atom2->nbRanges;j++) {
2594 r1 = atom1->ranges[i];
2595 r2 = atom2->ranges[j];
2596 res = xmlFACompareRanges(r1, r2);
2597 if (res == 1) {
2598 ret = 1;
2599 goto done;
2600 }
2601 }
2602 }
2603 ret = 0;
2604 }
2605 break;
2606 default:
2607 goto not_determinist;
2608 }
2609done:
2610 if (atom1->neg != atom2->neg) {
2611 ret = !ret;
2612 }
2613 if (ret == 0)
2614 return(0);
2615not_determinist:
2616 return(1);
2617}
2618
2627static int
2628xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2629 int to, xmlRegAtomPtr atom) {
2630 int ret = 1;
2631 int res;
2632 int transnr, nbTrans;
2633 xmlRegTransPtr t1;
2634 int deep = 1;
2635
2636 if (state == NULL)
2637 return(ret);
2638 if (state->markd == XML_REGEXP_MARK_VISITED)
2639 return(ret);
2640
2641 if (ctxt->flags & AM_AUTOMATA_RNG)
2642 deep = 0;
2643
2644 /*
2645 * don't recurse on transitions potentially added in the course of
2646 * the elimination.
2647 */
2648 nbTrans = state->nbTrans;
2649 for (transnr = 0;transnr < nbTrans;transnr++) {
2650 t1 = &(state->trans[transnr]);
2651 /*
2652 * check transitions conflicting with the one looked at
2653 */
2654 if (t1->atom == NULL) {
2655 if (t1->to < 0)
2656 continue;
2657 state->markd = XML_REGEXP_MARK_VISITED;
2658 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2659 to, atom);
2660 if (res == 0) {
2661 ret = 0;
2662 /* t1->nd = 1; */
2663 }
2664 continue;
2665 }
2666 if (t1->to != to)
2667 continue;
2668 if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2669 ret = 0;
2670 /* mark the transition as non-deterministic */
2671 t1->nd = 1;
2672 }
2673 }
2674 return(ret);
2675}
2676
2683static void
2684xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2685 int transnr, nbTrans;
2686
2687 if (state == NULL)
2688 return;
2689 if (state->markd != XML_REGEXP_MARK_VISITED)
2690 return;
2691 state->markd = 0;
2692
2693 nbTrans = state->nbTrans;
2694 for (transnr = 0; transnr < nbTrans; transnr++) {
2695 xmlRegTransPtr t1 = &state->trans[transnr];
2696 if ((t1->atom == NULL) && (t1->to >= 0))
2697 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2698 }
2699}
2700
2709static int
2710xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2711 int statenr, transnr;
2712 xmlRegStatePtr state;
2713 xmlRegTransPtr t1, t2, last;
2714 int i;
2715 int ret = 1;
2716 int deep = 1;
2717
2718#ifdef DEBUG_REGEXP_GRAPH
2719 printf("xmlFAComputesDeterminism\n");
2720 xmlRegPrintCtxt(stdout, ctxt);
2721#endif
2722 if (ctxt->determinist != -1)
2723 return(ctxt->determinist);
2724
2725 if (ctxt->flags & AM_AUTOMATA_RNG)
2726 deep = 0;
2727
2728 /*
2729 * First cleanup the automata removing cancelled transitions
2730 */
2731 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2732 state = ctxt->states[statenr];
2733 if (state == NULL)
2734 continue;
2735 if (state->nbTrans < 2)
2736 continue;
2737 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2738 t1 = &(state->trans[transnr]);
2739 /*
2740 * Determinism checks in case of counted or all transitions
2741 * will have to be handled separately
2742 */
2743 if (t1->atom == NULL) {
2744 /* t1->nd = 1; */
2745 continue;
2746 }
2747 if (t1->to == -1) /* eliminated */
2748 continue;
2749 for (i = 0;i < transnr;i++) {
2750 t2 = &(state->trans[i]);
2751 if (t2->to == -1) /* eliminated */
2752 continue;
2753 if (t2->atom != NULL) {
2754 if (t1->to == t2->to) {
2755 /*
2756 * Here we use deep because we want to keep the
2757 * transitions which indicate a conflict
2758 */
2759 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2760 (t1->counter == t2->counter) &&
2761 (t1->count == t2->count))
2762 t2->to = -1; /* eliminated */
2763 }
2764 }
2765 }
2766 }
2767 }
2768
2769 /*
2770 * Check for all states that there aren't 2 transitions
2771 * with the same atom and a different target.
2772 */
2773 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2774 state = ctxt->states[statenr];
2775 if (state == NULL)
2776 continue;
2777 if (state->nbTrans < 2)
2778 continue;
2779 last = NULL;
2780 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2781 t1 = &(state->trans[transnr]);
2782 /*
2783 * Determinism checks in case of counted or all transitions
2784 * will have to be handled separately
2785 */
2786 if (t1->atom == NULL) {
2787 continue;
2788 }
2789 if (t1->to == -1) /* eliminated */
2790 continue;
2791 for (i = 0;i < transnr;i++) {
2792 t2 = &(state->trans[i]);
2793 if (t2->to == -1) /* eliminated */
2794 continue;
2795 if (t2->atom != NULL) {
2796 /*
2797 * But here we don't use deep because we want to
2798 * find transitions which indicate a conflict
2799 */
2800 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2801 ret = 0;
2802 /* mark the transitions as non-deterministic ones */
2803 t1->nd = 1;
2804 t2->nd = 1;
2805 last = t1;
2806 }
2807 } else if (t1->to != -1) {
2808 /*
2809 * do the closure in case of remaining specific
2810 * epsilon transitions like choices or all
2811 */
2812 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2813 t2->to, t2->atom);
2814 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2815 /* don't shortcut the computation so all non deterministic
2816 transition get marked down
2817 if (ret == 0)
2818 return(0);
2819 */
2820 if (ret == 0) {
2821 t1->nd = 1;
2822 /* t2->nd = 1; */
2823 last = t1;
2824 }
2825 }
2826 }
2827 /* don't shortcut the computation so all non deterministic
2828 transition get marked down
2829 if (ret == 0)
2830 break; */
2831 }
2832
2833 /*
2834 * mark specifically the last non-deterministic transition
2835 * from a state since there is no need to set-up rollback
2836 * from it
2837 */
2838 if (last != NULL) {
2839 last->nd = 2;
2840 }
2841
2842 /* don't shortcut the computation so all non deterministic
2843 transition get marked down
2844 if (ret == 0)
2845 break; */
2846 }
2847
2848 ctxt->determinist = ret;
2849 return(ret);
2850}
2851
2852/************************************************************************
2853 * *
2854 * Routines to check input against transition atoms *
2855 * *
2856 ************************************************************************/
2857
2858static int
2859xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2860 int start, int end, const xmlChar *blockName) {
2861 int ret = 0;
2862
2863 switch (type) {
2864 case XML_REGEXP_STRING:
2865 case XML_REGEXP_SUBREG:
2866 case XML_REGEXP_RANGES:
2867 case XML_REGEXP_EPSILON:
2868 return(-1);
2869 case XML_REGEXP_ANYCHAR:
2870 ret = ((codepoint != '\n') && (codepoint != '\r'));
2871 break;
2872 case XML_REGEXP_CHARVAL:
2873 ret = ((codepoint >= start) && (codepoint <= end));
2874 break;
2875 case XML_REGEXP_NOTSPACE:
2876 neg = !neg;
2877 /* Falls through. */
2878 case XML_REGEXP_ANYSPACE:
2879 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2880 (codepoint == '\t') || (codepoint == ' '));
2881 break;
2882 case XML_REGEXP_NOTINITNAME:
2883 neg = !neg;
2884 /* Falls through. */
2885 case XML_REGEXP_INITNAME:
2886 ret = (IS_LETTER(codepoint) ||
2887 (codepoint == '_') || (codepoint == ':'));
2888 break;
2889 case XML_REGEXP_NOTNAMECHAR:
2890 neg = !neg;
2891 /* Falls through. */
2892 case XML_REGEXP_NAMECHAR:
2893 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2894 (codepoint == '.') || (codepoint == '-') ||
2895 (codepoint == '_') || (codepoint == ':') ||
2896 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2897 break;
2898 case XML_REGEXP_NOTDECIMAL:
2899 neg = !neg;
2900 /* Falls through. */
2901 case XML_REGEXP_DECIMAL:
2902 ret = xmlUCSIsCatNd(codepoint);
2903 break;
2904 case XML_REGEXP_REALCHAR:
2905 neg = !neg;
2906 /* Falls through. */
2907 case XML_REGEXP_NOTREALCHAR:
2908 ret = xmlUCSIsCatP(codepoint);
2909 if (ret == 0)
2910 ret = xmlUCSIsCatZ(codepoint);
2911 if (ret == 0)
2912 ret = xmlUCSIsCatC(codepoint);
2913 break;
2914 case XML_REGEXP_LETTER:
2915 ret = xmlUCSIsCatL(codepoint);
2916 break;
2917 case XML_REGEXP_LETTER_UPPERCASE:
2918 ret = xmlUCSIsCatLu(codepoint);
2919 break;
2920 case XML_REGEXP_LETTER_LOWERCASE:
2921 ret = xmlUCSIsCatLl(codepoint);
2922 break;
2923 case XML_REGEXP_LETTER_TITLECASE:
2924 ret = xmlUCSIsCatLt(codepoint);
2925 break;
2926 case XML_REGEXP_LETTER_MODIFIER:
2927 ret = xmlUCSIsCatLm(codepoint);
2928 break;
2929 case XML_REGEXP_LETTER_OTHERS:
2930 ret = xmlUCSIsCatLo(codepoint);
2931 break;
2932 case XML_REGEXP_MARK:
2933 ret = xmlUCSIsCatM(codepoint);
2934 break;
2935 case XML_REGEXP_MARK_NONSPACING:
2936 ret = xmlUCSIsCatMn(codepoint);
2937 break;
2938 case XML_REGEXP_MARK_SPACECOMBINING:
2939 ret = xmlUCSIsCatMc(codepoint);
2940 break;
2941 case XML_REGEXP_MARK_ENCLOSING:
2942 ret = xmlUCSIsCatMe(codepoint);
2943 break;
2944 case XML_REGEXP_NUMBER:
2945 ret = xmlUCSIsCatN(codepoint);
2946 break;
2947 case XML_REGEXP_NUMBER_DECIMAL:
2948 ret = xmlUCSIsCatNd(codepoint);
2949 break;
2950 case XML_REGEXP_NUMBER_LETTER:
2951 ret = xmlUCSIsCatNl(codepoint);
2952 break;
2953 case XML_REGEXP_NUMBER_OTHERS:
2954 ret = xmlUCSIsCatNo(codepoint);
2955 break;
2956 case XML_REGEXP_PUNCT:
2957 ret = xmlUCSIsCatP(codepoint);
2958 break;
2959 case XML_REGEXP_PUNCT_CONNECTOR:
2960 ret = xmlUCSIsCatPc(codepoint);
2961 break;
2962 case XML_REGEXP_PUNCT_DASH:
2963 ret = xmlUCSIsCatPd(codepoint);
2964 break;
2965 case XML_REGEXP_PUNCT_OPEN:
2966 ret = xmlUCSIsCatPs(codepoint);
2967 break;
2968 case XML_REGEXP_PUNCT_CLOSE:
2969 ret = xmlUCSIsCatPe(codepoint);
2970 break;
2971 case XML_REGEXP_PUNCT_INITQUOTE:
2972 ret = xmlUCSIsCatPi(codepoint);
2973 break;
2974 case XML_REGEXP_PUNCT_FINQUOTE:
2975 ret = xmlUCSIsCatPf(codepoint);
2976 break;
2977 case XML_REGEXP_PUNCT_OTHERS:
2978 ret = xmlUCSIsCatPo(codepoint);
2979 break;
2980 case XML_REGEXP_SEPAR:
2981 ret = xmlUCSIsCatZ(codepoint);
2982 break;
2983 case XML_REGEXP_SEPAR_SPACE:
2984 ret = xmlUCSIsCatZs(codepoint);
2985 break;
2986 case XML_REGEXP_SEPAR_LINE:
2987 ret = xmlUCSIsCatZl(codepoint);
2988 break;
2989 case XML_REGEXP_SEPAR_PARA:
2990 ret = xmlUCSIsCatZp(codepoint);
2991 break;
2992 case XML_REGEXP_SYMBOL:
2993 ret = xmlUCSIsCatS(codepoint);
2994 break;
2995 case XML_REGEXP_SYMBOL_MATH:
2996 ret = xmlUCSIsCatSm(codepoint);
2997 break;
2998 case XML_REGEXP_SYMBOL_CURRENCY:
2999 ret = xmlUCSIsCatSc(codepoint);
3000 break;
3001 case XML_REGEXP_SYMBOL_MODIFIER:
3002 ret = xmlUCSIsCatSk(codepoint);
3003 break;
3004 case XML_REGEXP_SYMBOL_OTHERS:
3005 ret = xmlUCSIsCatSo(codepoint);
3006 break;
3007 case XML_REGEXP_OTHER:
3008 ret = xmlUCSIsCatC(codepoint);
3009 break;
3010 case XML_REGEXP_OTHER_CONTROL:
3011 ret = xmlUCSIsCatCc(codepoint);
3012 break;
3013 case XML_REGEXP_OTHER_FORMAT:
3014 ret = xmlUCSIsCatCf(codepoint);
3015 break;
3016 case XML_REGEXP_OTHER_PRIVATE:
3017 ret = xmlUCSIsCatCo(codepoint);
3018 break;
3019 case XML_REGEXP_OTHER_NA:
3020 /* ret = xmlUCSIsCatCn(codepoint); */
3021 /* Seems it doesn't exist anymore in recent Unicode releases */
3022 ret = 0;
3023 break;
3024 case XML_REGEXP_BLOCK_NAME:
3025 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
3026 break;
3027 }
3028 if (neg)
3029 return(!ret);
3030 return(ret);
3031}
3032
3033static int
3034xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
3035 int i, ret = 0;
3036 xmlRegRangePtr range;
3037
3038 if ((atom == NULL) || (!IS_CHAR(codepoint)))
3039 return(-1);
3040
3041 switch (atom->type) {
3042 case XML_REGEXP_SUBREG:
3043 case XML_REGEXP_EPSILON:
3044 return(-1);
3045 case XML_REGEXP_CHARVAL:
3046 return(codepoint == atom->codepoint);
3047 case XML_REGEXP_RANGES: {
3048 int accept = 0;
3049
3050 for (i = 0;i < atom->nbRanges;i++) {
3051 range = atom->ranges[i];
3052 if (range->neg == 2) {
3053 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3054 0, range->start, range->end,
3055 range->blockName);
3056 if (ret != 0)
3057 return(0); /* excluded char */
3058 } else if (range->neg) {
3059 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3060 0, range->start, range->end,
3061 range->blockName);
3062 if (ret == 0)
3063 accept = 1;
3064 else
3065 return(0);
3066 } else {
3067 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3068 0, range->start, range->end,
3069 range->blockName);
3070 if (ret != 0)
3071 accept = 1; /* might still be excluded */
3072 }
3073 }
3074 return(accept);
3075 }
3076 case XML_REGEXP_STRING:
3077 printf("TODO: XML_REGEXP_STRING\n");
3078 return(-1);
3079 case XML_REGEXP_ANYCHAR:
3080 case XML_REGEXP_ANYSPACE:
3081 case XML_REGEXP_NOTSPACE:
3082 case XML_REGEXP_INITNAME:
3083 case XML_REGEXP_NOTINITNAME:
3084 case XML_REGEXP_NAMECHAR:
3085 case XML_REGEXP_NOTNAMECHAR:
3086 case XML_REGEXP_DECIMAL:
3087 case XML_REGEXP_NOTDECIMAL:
3088 case XML_REGEXP_REALCHAR:
3089 case XML_REGEXP_NOTREALCHAR:
3090 case XML_REGEXP_LETTER:
3091 case XML_REGEXP_LETTER_UPPERCASE:
3092 case XML_REGEXP_LETTER_LOWERCASE:
3093 case XML_REGEXP_LETTER_TITLECASE:
3094 case XML_REGEXP_LETTER_MODIFIER:
3095 case XML_REGEXP_LETTER_OTHERS:
3096 case XML_REGEXP_MARK:
3097 case XML_REGEXP_MARK_NONSPACING:
3098 case XML_REGEXP_MARK_SPACECOMBINING:
3099 case XML_REGEXP_MARK_ENCLOSING:
3100 case XML_REGEXP_NUMBER:
3101 case XML_REGEXP_NUMBER_DECIMAL:
3102 case XML_REGEXP_NUMBER_LETTER:
3103 case XML_REGEXP_NUMBER_OTHERS:
3104 case XML_REGEXP_PUNCT:
3105 case XML_REGEXP_PUNCT_CONNECTOR:
3106 case XML_REGEXP_PUNCT_DASH:
3107 case XML_REGEXP_PUNCT_OPEN:
3108 case XML_REGEXP_PUNCT_CLOSE:
3109 case XML_REGEXP_PUNCT_INITQUOTE:
3110 case XML_REGEXP_PUNCT_FINQUOTE:
3111 case XML_REGEXP_PUNCT_OTHERS:
3112 case XML_REGEXP_SEPAR:
3113 case XML_REGEXP_SEPAR_SPACE:
3114 case XML_REGEXP_SEPAR_LINE:
3115 case XML_REGEXP_SEPAR_PARA:
3116 case XML_REGEXP_SYMBOL:
3117 case XML_REGEXP_SYMBOL_MATH:
3118 case XML_REGEXP_SYMBOL_CURRENCY:
3119 case XML_REGEXP_SYMBOL_MODIFIER:
3120 case XML_REGEXP_SYMBOL_OTHERS:
3121 case XML_REGEXP_OTHER:
3122 case XML_REGEXP_OTHER_CONTROL:
3123 case XML_REGEXP_OTHER_FORMAT:
3124 case XML_REGEXP_OTHER_PRIVATE:
3125 case XML_REGEXP_OTHER_NA:
3126 case XML_REGEXP_BLOCK_NAME:
3127 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3128 (const xmlChar *)atom->valuep);
3129 if (atom->neg)
3130 ret = !ret;
3131 break;
3132 }
3133 return(ret);
3134}
3135
3136/************************************************************************
3137 * *
3138 * Saving and restoring state of an execution context *
3139 * *
3140 ************************************************************************/
3141
3142#ifdef DEBUG_REGEXP_EXEC
3143static void
3144xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
3145 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
3146 if (exec->inputStack != NULL) {
3147 int i;
3148 printf(": ");
3149 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
3150 printf("%s ", (const char *)
3151 exec->inputStack[exec->inputStackNr - (i + 1)].value);
3152 } else {
3153 printf(": %s", &(exec->inputString[exec->index]));
3154 }
3155 printf("\n");
3156}
3157#endif
3158
3159static void
3160xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3161#ifdef DEBUG_REGEXP_EXEC
3162 printf("saving ");
3163 exec->transno++;
3164 xmlFARegDebugExec(exec);
3165 exec->transno--;
3166#endif
3167#ifdef MAX_PUSH
3168 if (exec->nbPush > MAX_PUSH) {
3169 return;
3170 }
3171 exec->nbPush++;
3172#endif
3173
3174 if (exec->maxRollbacks == 0) {
3175 exec->maxRollbacks = 4;
3176 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3177 sizeof(xmlRegExecRollback));
3178 if (exec->rollbacks == NULL) {
3179 xmlRegexpErrMemory(NULL, "saving regexp");
3180 exec->maxRollbacks = 0;
3181 return;
3182 }
3183 memset(exec->rollbacks, 0,
3184 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3185 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3186 xmlRegExecRollback *tmp;
3187 int len = exec->maxRollbacks;
3188
3189 exec->maxRollbacks *= 2;
3190 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3191 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3192 if (tmp == NULL) {
3193 xmlRegexpErrMemory(NULL, "saving regexp");
3194 exec->maxRollbacks /= 2;
3195 return;
3196 }
3197 exec->rollbacks = tmp;
3198 tmp = &exec->rollbacks[len];
3199 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3200 }
3201 exec->rollbacks[exec->nbRollbacks].state = exec->state;
3202 exec->rollbacks[exec->nbRollbacks].index = exec->index;
3203 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3204 if (exec->comp->nbCounters > 0) {
3205 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3206 exec->rollbacks[exec->nbRollbacks].counts = (int *)
3207 xmlMalloc(exec->comp->nbCounters * sizeof(int));
3208 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3209 xmlRegexpErrMemory(NULL, "saving regexp");
3210 exec->status = -5;
3211 return;
3212 }
3213 }
3214 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3215 exec->comp->nbCounters * sizeof(int));
3216 }
3217 exec->nbRollbacks++;
3218}
3219
3220static void
3221xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3222 if (exec->nbRollbacks <= 0) {
3223 exec->status = -1;
3224#ifdef DEBUG_REGEXP_EXEC
3225 printf("rollback failed on empty stack\n");
3226#endif
3227 return;
3228 }
3229 exec->nbRollbacks--;
3230 exec->state = exec->rollbacks[exec->nbRollbacks].state;
3231 exec->index = exec->rollbacks[exec->nbRollbacks].index;
3232 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3233 if (exec->comp->nbCounters > 0) {
3234 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3235 fprintf(stderr, "exec save: allocation failed");
3236 exec->status = -6;
3237 return;
3238 }
3239 if (exec->counts) {
3240 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3241 exec->comp->nbCounters * sizeof(int));
3242 }
3243 }
3244
3245#ifdef DEBUG_REGEXP_EXEC
3246 printf("restored ");
3247 xmlFARegDebugExec(exec);
3248#endif
3249}
3250
3251/************************************************************************
3252 * *
3253 * Verifier, running an input against a compiled regexp *
3254 * *
3255 ************************************************************************/
3256
3257static int
3258xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3259 xmlRegExecCtxt execval;
3260 xmlRegExecCtxtPtr exec = &execval;
3261 int ret, codepoint = 0, len, deter;
3262
3263 exec->inputString = content;
3264 exec->index = 0;
3265 exec->nbPush = 0;
3266 exec->determinist = 1;
3267 exec->maxRollbacks = 0;
3268 exec->nbRollbacks = 0;
3269 exec->rollbacks = NULL;
3270 exec->status = 0;
3271 exec->comp = comp;
3272 exec->state = comp->states[0];
3273 exec->transno = 0;
3274 exec->transcount = 0;
3275 exec->inputStack = NULL;
3276 exec->inputStackMax = 0;
3277 if (comp->nbCounters > 0) {
3278 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3279 if (exec->counts == NULL) {
3280 xmlRegexpErrMemory(NULL, "running regexp");
3281 return(-1);
3282 }
3283 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3284 } else
3285 exec->counts = NULL;
3286 while ((exec->status == 0) && (exec->state != NULL) &&
3287 ((exec->inputString[exec->index] != 0) ||
3288 ((exec->state != NULL) &&
3289 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3290 xmlRegTransPtr trans;
3291 xmlRegAtomPtr atom;
3292
3293 /*
3294 * If end of input on non-terminal state, rollback, however we may
3295 * still have epsilon like transition for counted transitions
3296 * on counters, in that case don't break too early. Additionally,
3297 * if we are working on a range like "AB{0,2}", where B is not present,
3298 * we don't want to break.
3299 */
3300 len = 1;
3301 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3302 /*
3303 * if there is a transition, we must check if
3304 * atom allows minOccurs of 0
3305 */
3306 if (exec->transno < exec->state->nbTrans) {
3307 trans = &exec->state->trans[exec->transno];
3308 if (trans->to >=0) {
3309 atom = trans->atom;
3310 if (!((atom->min == 0) && (atom->max > 0)))
3311 goto rollback;
3312 }
3313 } else
3314 goto rollback;
3315 }
3316
3317 exec->transcount = 0;
3318 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3319 trans = &exec->state->trans[exec->transno];
3320 if (trans->to < 0)
3321 continue;
3322 atom = trans->atom;
3323 ret = 0;
3324 deter = 1;
3325 if (trans->count >= 0) {
3326 int count;
3327 xmlRegCounterPtr counter;
3328
3329 if (exec->counts == NULL) {
3330 exec->status = -1;
3331 goto error;
3332 }
3333 /*
3334 * A counted transition.
3335 */
3336
3337 count = exec->counts[trans->count];
3338 counter = &exec->comp->counters[trans->count];
3339#ifdef DEBUG_REGEXP_EXEC
3340 printf("testing count %d: val %d, min %d, max %d\n",
3341 trans->count, count, counter->min, counter->max);
3342#endif
3343 ret = ((count >= counter->min) && (count <= counter->max));
3344 if ((ret) && (counter->min != counter->max))
3345 deter = 0;
3346 } else if (atom == NULL) {
3347 fprintf(stderr, "epsilon transition left at runtime\n");
3348 exec->status = -2;
3349 break;
3350 } else if (exec->inputString[exec->index] != 0) {
3351 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3352 ret = xmlRegCheckCharacter(atom, codepoint);
3353 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3354 xmlRegStatePtr to = comp->states[trans->to];
3355
3356 /*
3357 * this is a multiple input sequence
3358 * If there is a counter associated increment it now.
3359 * do not increment if the counter is already over the
3360 * maximum limit in which case get to next transition
3361 */
3362 if (trans->counter >= 0) {
3363 xmlRegCounterPtr counter;
3364
3365 if ((exec->counts == NULL) ||
3366 (exec->comp == NULL) ||
3367 (exec->comp->counters == NULL)) {
3368 exec->status = -1;
3369 goto error;
3370 }
3371 counter = &exec->comp->counters[trans->counter];
3372 if (exec->counts[trans->counter] >= counter->max)
3373 continue; /* for loop on transitions */
3374 }
3375 /* Save before incrementing */
3376 if (exec->state->nbTrans > exec->transno + 1) {
3377 xmlFARegExecSave(exec);
3378 }
3379 if (trans->counter >= 0) {
3380#ifdef DEBUG_REGEXP_EXEC
3381 printf("Increasing count %d\n", trans->counter);
3382#endif
3383 exec->counts[trans->counter]++;
3384 }
3385 exec->transcount = 1;
3386 do {
3387 /*
3388 * Try to progress as much as possible on the input
3389 */
3390 if (exec->transcount == atom->max) {
3391 break;
3392 }
3393 exec->index += len;
3394 /*
3395 * End of input: stop here
3396 */
3397 if (exec->inputString[exec->index] == 0) {
3398 exec->index -= len;
3399 break;
3400 }
3401 if (exec->transcount >= atom->min) {
3402 int transno = exec->transno;
3403 xmlRegStatePtr state = exec->state;
3404
3405 /*
3406 * The transition is acceptable save it
3407 */
3408 exec->transno = -1; /* trick */
3409 exec->state = to;
3410 xmlFARegExecSave(exec);
3411 exec->transno = transno;
3412 exec->state = state;
3413 }
3414 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3415 len);
3416 ret = xmlRegCheckCharacter(atom, codepoint);
3417 exec->transcount++;
3418 } while (ret == 1);
3419 if (exec->transcount < atom->min)
3420 ret = 0;
3421
3422 /*
3423 * If the last check failed but one transition was found
3424 * possible, rollback
3425 */
3426 if (ret < 0)
3427 ret = 0;
3428 if (ret == 0) {
3429 goto rollback;
3430 }
3431 if (trans->counter >= 0) {
3432 if (exec->counts == NULL) {
3433 exec->status = -1;
3434 goto error;
3435 }
3436#ifdef DEBUG_REGEXP_EXEC
3437 printf("Decreasing count %d\n", trans->counter);
3438#endif
3439 exec->counts[trans->counter]--;
3440 }
3441 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3442 /*
3443 * we don't match on the codepoint, but minOccurs of 0
3444 * says that's ok. Setting len to 0 inhibits stepping
3445 * over the codepoint.
3446 */
3447 exec->transcount = 1;
3448 len = 0;
3449 ret = 1;
3450 }
3451 } else if ((atom->min == 0) && (atom->max > 0)) {
3452 /* another spot to match when minOccurs is 0 */
3453 exec->transcount = 1;
3454 len = 0;
3455 ret = 1;
3456 }
3457 if (ret == 1) {
3458 if ((trans->nd == 1) ||
3459 ((trans->count >= 0) && (deter == 0) &&
3460 (exec->state->nbTrans > exec->transno + 1))) {
3461#ifdef DEBUG_REGEXP_EXEC
3462 if (trans->nd == 1)
3463 printf("Saving on nd transition atom %d for %c at %d\n",
3464 trans->atom->no, codepoint, exec->index);
3465 else
3466 printf("Saving on counted transition count %d for %c at %d\n",
3467 trans->count, codepoint, exec->index);
3468#endif
3469 xmlFARegExecSave(exec);
3470 }
3471 if (trans->counter >= 0) {
3472 xmlRegCounterPtr counter;
3473
3474 /* make sure we don't go over the counter maximum value */
3475 if ((exec->counts == NULL) ||
3476 (exec->comp == NULL) ||
3477 (exec->comp->counters == NULL)) {
3478 exec->status = -1;
3479 goto error;
3480 }
3481 counter = &exec->comp->counters[trans->counter];
3482 if (exec->counts[trans->counter] >= counter->max)
3483 continue; /* for loop on transitions */
3484#ifdef DEBUG_REGEXP_EXEC
3485 printf("Increasing count %d\n", trans->counter);
3486#endif
3487 exec->counts[trans->counter]++;
3488 }
3489 if ((trans->count >= 0) &&
3490 (trans->count < REGEXP_ALL_COUNTER)) {
3491 if (exec->counts == NULL) {
3492 exec->status = -1;
3493 goto error;
3494 }
3495#ifdef DEBUG_REGEXP_EXEC
3496 printf("resetting count %d on transition\n",
3497 trans->count);
3498#endif
3499 exec->counts[trans->count] = 0;
3500 }
3501#ifdef DEBUG_REGEXP_EXEC
3502 printf("entering state %d\n", trans->to);
3503#endif
3504 exec->state = comp->states[trans->to];
3505 exec->transno = 0;
3506 if (trans->atom != NULL) {
3507 exec->index += len;
3508 }
3509 goto progress;
3510 } else if (ret < 0) {
3511 exec->status = -4;
3512 break;
3513 }
3514 }
3515 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3516rollback:
3517 /*
3518 * Failed to find a way out
3519 */
3520 exec->determinist = 0;
3521#ifdef DEBUG_REGEXP_EXEC
3522 printf("rollback from state %d on %d:%c\n", exec->state->no,
3523 codepoint,codepoint);
3524#endif
3525 xmlFARegExecRollBack(exec);
3526 }
3527progress:
3528 continue;
3529 }
3530error:
3531 if (exec->rollbacks != NULL) {
3532 if (exec->counts != NULL) {
3533 int i;
3534
3535 for (i = 0;i < exec->maxRollbacks;i++)
3536 if (exec->rollbacks[i].counts != NULL)
3537 xmlFree(exec->rollbacks[i].counts);
3538 }
3539 xmlFree(exec->rollbacks);
3540 }
3541 if (exec->state == NULL)
3542 return(-1);
3543 if (exec->counts != NULL)
3544 xmlFree(exec->counts);
3545 if (exec->status == 0)
3546 return(1);
3547 if (exec->status == -1) {
3548 if (exec->nbPush > MAX_PUSH)
3549 return(-1);
3550 return(0);
3551 }
3552 return(exec->status);
3553}
3554
3555/************************************************************************
3556 * *
3557 * Progressive interface to the verifier one atom at a time *
3558 * *
3559 ************************************************************************/
3560#ifdef DEBUG_ERR
3561static void testerr(xmlRegExecCtxtPtr exec);
3562#endif
3563
3575xmlRegExecCtxtPtr
3576xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3577 xmlRegExecCtxtPtr exec;
3578
3579 if (comp == NULL)
3580 return(NULL);
3581 if ((comp->compact == NULL) && (comp->states == NULL))
3582 return(NULL);
3583 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3584 if (exec == NULL) {
3585 xmlRegexpErrMemory(NULL, "creating execution context");
3586 return(NULL);
3587 }
3588 memset(exec, 0, sizeof(xmlRegExecCtxt));
3589 exec->inputString = NULL;
3590 exec->index = 0;
3591 exec->determinist = 1;
3592 exec->maxRollbacks = 0;
3593 exec->nbRollbacks = 0;
3594 exec->rollbacks = NULL;
3595 exec->status = 0;
3596 exec->comp = comp;
3597 if (comp->compact == NULL)
3598 exec->state = comp->states[0];
3599 exec->transno = 0;
3600 exec->transcount = 0;
3601 exec->callback = callback;
3602 exec->data = data;
3603 if (comp->nbCounters > 0) {
3604 /*
3605 * For error handling, exec->counts is allocated twice the size
3606 * the second half is used to store the data in case of rollback
3607 */
3608 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3609 * 2);
3610 if (exec->counts == NULL) {
3611 xmlRegexpErrMemory(NULL, "creating execution context");
3612 xmlFree(exec);
3613 return(NULL);
3614 }
3615 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3616 exec->errCounts = &exec->counts[comp->nbCounters];
3617 } else {
3618 exec->counts = NULL;
3619 exec->errCounts = NULL;
3620 }
3621 exec->inputStackMax = 0;
3622 exec->inputStackNr = 0;
3623 exec->inputStack = NULL;
3624 exec->errStateNo = -1;
3625 exec->errString = NULL;
3626 exec->nbPush = 0;
3627 return(exec);
3628}
3629
3636void
3637xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3638 if (exec == NULL)
3639 return;
3640
3641 if (exec->rollbacks != NULL) {
3642 if (exec->counts != NULL) {
3643 int i;
3644
3645 for (i = 0;i < exec->maxRollbacks;i++)
3646 if (exec->rollbacks[i].counts != NULL)
3647 xmlFree(exec->rollbacks[i].counts);
3648 }
3649 xmlFree(exec->rollbacks);
3650 }
3651 if (exec->counts != NULL)
3652 xmlFree(exec->counts);
3653 if (exec->inputStack != NULL) {
3654 int i;
3655
3656 for (i = 0;i < exec->inputStackNr;i++) {
3657 if (exec->inputStack[i].value != NULL)
3658 xmlFree(exec->inputStack[i].value);
3659 }
3660 xmlFree(exec->inputStack);
3661 }
3662 if (exec->errString != NULL)
3663 xmlFree(exec->errString);
3664 xmlFree(exec);
3665}
3666
3667static void
3668xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3669 void *data) {
3670#ifdef DEBUG_PUSH
3671 printf("saving value: %d:%s\n", exec->inputStackNr, value);
3672#endif
3673 if (exec->inputStackMax == 0) {
3674 exec->inputStackMax = 4;
3675 exec->inputStack = (xmlRegInputTokenPtr)
3676 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3677 if (exec->inputStack == NULL) {
3678 xmlRegexpErrMemory(NULL, "pushing input string");
3679 exec->inputStackMax = 0;
3680 return;
3681 }
3682 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3683 xmlRegInputTokenPtr tmp;
3684
3685 exec->inputStackMax *= 2;
3686 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3687 exec->inputStackMax * sizeof(xmlRegInputToken));
3688 if (tmp == NULL) {
3689 xmlRegexpErrMemory(NULL, "pushing input string");
3690 exec->inputStackMax /= 2;
3691 return;
3692 }
3693 exec->inputStack = tmp;
3694 }
3695 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3696 exec->inputStack[exec->inputStackNr].data = data;
3697 exec->inputStackNr++;
3698 exec->inputStack[exec->inputStackNr].value = NULL;
3699 exec->inputStack[exec->inputStackNr].data = NULL;
3700}
3701
3715static int
3716xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3717 if (expStr == valStr) return(1);
3718 if (expStr == NULL) return(0);
3719 if (valStr == NULL) return(0);
3720 do {
3721 /*
3722 * Eval if we have a wildcard for the current item.
3723 */
3724 if (*expStr != *valStr) {
3725 /* if one of them starts with a wildcard make valStr be it */
3726 if (*valStr == '*') {
3727 const xmlChar *tmp;
3728
3729 tmp = valStr;
3730 valStr = expStr;
3731 expStr = tmp;
3732 }
3733 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3734 do {
3735 if (*valStr == XML_REG_STRING_SEPARATOR)
3736 break;
3737 valStr++;
3738 } while (*valStr != 0);
3739 continue;
3740 } else
3741 return(0);
3742 }
3743 expStr++;
3744 valStr++;
3745 } while (*valStr != 0);
3746 if (*expStr != 0)
3747 return (0);
3748 else
3749 return (1);
3750}
3751
3764static int
3765xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3766 xmlRegexpPtr comp,
3767 const xmlChar *value,
3768 void *data) {
3769 int state = exec->index;
3770 int i, target;
3771
3772 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3773 return(-1);
3774
3775 if (value == NULL) {
3776 /*
3777 * are we at a final state ?
3778 */
3779 if (comp->compact[state * (comp->nbstrings + 1)] ==
3780 XML_REGEXP_FINAL_STATE)
3781 return(1);
3782 return(0);
3783 }
3784
3785#ifdef DEBUG_PUSH
3786 printf("value pushed: %s\n", value);
3787#endif
3788
3789 /*
3790 * Examine all outside transitions from current state
3791 */
3792 for (i = 0;i < comp->nbstrings;i++) {
3793 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3794 if ((target > 0) && (target <= comp->nbstates)) {
3795 target--; /* to avoid 0 */
3796 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3797 exec->index = target;
3798 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3799 exec->callback(exec->data, value,
3800 comp->transdata[state * comp->nbstrings + i], data);
3801 }
3802#ifdef DEBUG_PUSH
3803 printf("entering state %d\n", target);
3804#endif
3805 if (comp->compact[target * (comp->nbstrings + 1)] ==
3806 XML_REGEXP_SINK_STATE)
3807 goto error;
3808
3809 if (comp->compact[target * (comp->nbstrings + 1)] ==
3810 XML_REGEXP_FINAL_STATE)
3811 return(1);
3812 return(0);
3813 }
3814 }
3815 }
3816 /*
3817 * Failed to find an exit transition out from current state for the
3818 * current token
3819 */
3820#ifdef DEBUG_PUSH
3821 printf("failed to find a transition for %s on state %d\n", value, state);
3822#endif
3823error:
3824 if (exec->errString != NULL)
3825 xmlFree(exec->errString);
3826 exec->errString = xmlStrdup(value);
3827 exec->errStateNo = state;
3828 exec->status = -1;
3829#ifdef DEBUG_ERR
3830 testerr(exec);
3831#endif
3832 return(-1);
3833}
3834
3847static int
3848xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3849 void *data, int compound) {
3850 xmlRegTransPtr trans;
3851 xmlRegAtomPtr atom;
3852 int ret;
3853 int final = 0;
3854 int progress = 1;
3855
3856 if (exec == NULL)
3857 return(-1);
3858 if (exec->comp == NULL)
3859 return(-1);
3860 if (exec->status != 0)
3861 return(exec->status);
3862
3863 if (exec->comp->compact != NULL)
3864 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3865
3866 if (value == NULL) {
3867 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3868 return(1);
3869 final = 1;
3870 }
3871
3872#ifdef DEBUG_PUSH
3873 printf("value pushed: %s\n", value);
3874#endif
3875 /*
3876 * If we have an active rollback stack push the new value there
3877 * and get back to where we were left
3878 */
3879 if ((value != NULL) && (exec->inputStackNr > 0)) {
3880 xmlFARegExecSaveInputString(exec, value, data);
3881 value = exec->inputStack[exec->index].value;
3882 data = exec->inputStack[exec->index].data;
3883#ifdef DEBUG_PUSH
3884 printf("value loaded: %s\n", value);
3885#endif
3886 }
3887
3888 while ((exec->status == 0) &&
3889 ((value != NULL) ||
3890 ((final == 1) &&
3891 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3892
3893 /*
3894 * End of input on non-terminal state, rollback, however we may
3895 * still have epsilon like transition for counted transitions
3896 * on counters, in that case don't break too early.
3897 */
3898 if ((value == NULL) && (exec->counts == NULL))
3899 goto rollback;
3900
3901 exec->transcount = 0;
3902 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3903 trans = &exec->state->trans[exec->transno];
3904 if (trans->to < 0)
3905 continue;
3906 atom = trans->atom;
3907 ret = 0;
3908 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3909 int i;
3910 int count;
3911 xmlRegTransPtr t;
3912 xmlRegCounterPtr counter;
3913
3914 ret = 0;
3915
3916#ifdef DEBUG_PUSH
3917 printf("testing all lax %d\n", trans->count);
3918#endif
3919 /*
3920 * Check all counted transitions from the current state
3921 */
3922 if ((value == NULL) && (final)) {
3923 ret = 1;
3924 } else if (value != NULL) {
3925 for (i = 0;i < exec->state->nbTrans;i++) {
3926 t = &exec->state->trans[i];
3927 if ((t->counter < 0) || (t == trans))
3928 continue;
3929 counter = &exec->comp->counters[t->counter];
3930 count = exec->counts[t->counter];
3931 if ((count < counter->max) &&
3932 (t->atom != NULL) &&
3933 (xmlStrEqual(value, t->atom->valuep))) {
3934 ret = 0;
3935 break;
3936 }
3937 if ((count >= counter->min) &&
3938 (count < counter->max) &&
3939 (t->atom != NULL) &&
3940 (xmlStrEqual(value, t->atom->valuep))) {
3941 ret = 1;
3942 break;
3943 }
3944 }
3945 }
3946 } else if (trans->count == REGEXP_ALL_COUNTER) {
3947 int i;
3948 int count;
3949 xmlRegTransPtr t;
3950 xmlRegCounterPtr counter;
3951
3952 ret = 1;
3953
3954#ifdef DEBUG_PUSH
3955 printf("testing all %d\n", trans->count);
3956#endif
3957 /*
3958 * Check all counted transitions from the current state
3959 */
3960 for (i = 0;i < exec->state->nbTrans;i++) {
3961 t = &exec->state->trans[i];
3962 if ((t->counter < 0) || (t == trans))
3963 continue;
3964 counter = &exec->comp->counters[t->counter];
3965 count = exec->counts[t->counter];
3966 if ((count < counter->min) || (count > counter->max)) {
3967 ret = 0;
3968 break;
3969 }
3970 }
3971 } else if (trans->count >= 0) {
3972 int count;
3973 xmlRegCounterPtr counter;
3974
3975 /*
3976 * A counted transition.
3977 */
3978
3979 count = exec->counts[trans->count];
3980 counter = &exec->comp->counters[trans->count];
3981#ifdef DEBUG_PUSH
3982 printf("testing count %d: val %d, min %d, max %d\n",
3983 trans->count, count, counter->min, counter->max);
3984#endif
3985 ret = ((count >= counter->min) && (count <= counter->max));
3986 } else if (atom == NULL) {
3987 fprintf(stderr, "epsilon transition left at runtime\n");
3988 exec->status = -2;
3989 break;
3990 } else if (value != NULL) {
3991 ret = xmlRegStrEqualWildcard(atom->valuep, value);
3992 if (atom->neg) {
3993 ret = !ret;
3994 if (!compound)
3995 ret = 0;
3996 }
3997 if ((ret == 1) && (trans->counter >= 0)) {
3998 xmlRegCounterPtr counter;
3999 int count;
4000
4001 count = exec->counts[trans->counter];
4002 counter = &exec->comp->counters[trans->counter];
4003 if (count >= counter->max)
4004 ret = 0;
4005 }
4006
4007 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4008 xmlRegStatePtr to = exec->comp->states[trans->to];
4009
4010 /*
4011 * this is a multiple input sequence
4012 */
4013 if (exec->state->nbTrans > exec->transno + 1) {
4014 if (exec->inputStackNr <= 0) {
4015 xmlFARegExecSaveInputString(exec, value, data);
4016 }
4017 xmlFARegExecSave(exec);
4018 }
4019 exec->transcount = 1;
4020 do {
4021 /*
4022 * Try to progress as much as possible on the input
4023 */
4024 if (exec->transcount == atom->max) {
4025 break;
4026 }
4027 exec->index++;
4028 value = exec->inputStack[exec->index].value;
4029 data = exec->inputStack[exec->index].data;
4030#ifdef DEBUG_PUSH
4031 printf("value loaded: %s\n", value);
4032#endif
4033
4034 /*
4035 * End of input: stop here
4036 */
4037 if (value == NULL) {
4038 exec->index --;
4039 break;
4040 }
4041 if (exec->transcount >= atom->min) {
4042 int transno = exec->transno;
4043 xmlRegStatePtr state = exec->state;
4044
4045 /*
4046 * The transition is acceptable save it
4047 */
4048 exec->transno = -1; /* trick */
4049 exec->state = to;
4050 if (exec->inputStackNr <= 0) {
4051 xmlFARegExecSaveInputString(exec, value, data);
4052 }
4053 xmlFARegExecSave(exec);
4054 exec->transno = transno;
4055 exec->state = state;
4056 }
4057 ret = xmlStrEqual(value, atom->valuep);
4058 exec->transcount++;
4059 } while (ret == 1);
4060 if (exec->transcount < atom->min)
4061 ret = 0;
4062
4063 /*
4064 * If the last check failed but one transition was found
4065 * possible, rollback
4066 */
4067 if (ret < 0)
4068 ret = 0;
4069 if (ret == 0) {
4070 goto rollback;
4071 }
4072 }
4073 }
4074 if (ret == 1) {
4075 if ((exec->callback != NULL) && (atom != NULL) &&
4076 (data != NULL)) {
4077 exec->callback(exec->data, atom->valuep,
4078 atom->data, data);
4079 }
4080 if (exec->state->nbTrans > exec->transno + 1) {
4081 if (exec->inputStackNr <= 0) {
4082 xmlFARegExecSaveInputString(exec, value, data);
4083 }
4084 xmlFARegExecSave(exec);
4085 }
4086 if (trans->counter >= 0) {
4087#ifdef DEBUG_PUSH
4088 printf("Increasing count %d\n", trans->counter);
4089#endif
4090 exec->counts[trans->counter]++;
4091 }
4092 if ((trans->count >= 0) &&
4093 (trans->count < REGEXP_ALL_COUNTER)) {
4094#ifdef DEBUG_REGEXP_EXEC
4095 printf("resetting count %d on transition\n",
4096 trans->count);
4097#endif
4098 exec->counts[trans->count] = 0;
4099 }
4100#ifdef DEBUG_PUSH
4101 printf("entering state %d\n", trans->to);
4102#endif
4103 if ((exec->comp->states[trans->to] != NULL) &&
4104 (exec->comp->states[trans->to]->type ==
4105 XML_REGEXP_SINK_STATE)) {
4106 /*
4107 * entering a sink state, save the current state as error
4108 * state.
4109 */
4110 if (exec->errString != NULL)
4111 xmlFree(exec->errString);
4112 exec->errString = xmlStrdup(value);
4113 exec->errState = exec->state;
4114 memcpy(exec->errCounts, exec->counts,
4115 exec->comp->nbCounters * sizeof(int));
4116 }
4117 exec->state = exec->comp->states[trans->to];
4118 exec->transno = 0;
4119 if (trans->atom != NULL) {
4120 if (exec->inputStack != NULL) {
4121 exec->index++;
4122 if (exec->index < exec->inputStackNr) {
4123 value = exec->inputStack[exec->index].value;
4124 data = exec->inputStack[exec->index].data;
4125#ifdef DEBUG_PUSH
4126 printf("value loaded: %s\n", value);
4127#endif
4128 } else {
4129 value = NULL;
4130 data = NULL;
4131#ifdef DEBUG_PUSH
4132 printf("end of input\n");
4133#endif
4134 }
4135 } else {
4136 value = NULL;
4137 data = NULL;
4138#ifdef DEBUG_PUSH
4139 printf("end of input\n");
4140#endif
4141 }
4142 }
4143 goto progress;
4144 } else if (ret < 0) {
4145 exec->status = -4;
4146 break;
4147 }
4148 }
4149 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4150rollback:
4151 /*
4152 * if we didn't yet rollback on the current input
4153 * store the current state as the error state.
4154 */
4155 if ((progress) && (exec->state != NULL) &&
4156 (exec->state->type != XML_REGEXP_SINK_STATE)) {
4157 progress = 0;
4158 if (exec->errString != NULL)
4159 xmlFree(exec->errString);
4160 exec->errString = xmlStrdup(value);
4161 exec->errState = exec->state;
4162 if (exec->comp->nbCounters)
4163 memcpy(exec->errCounts, exec->counts,
4164 exec->comp->nbCounters * sizeof(int));
4165 }
4166
4167 /*
4168 * Failed to find a way out
4169 */
4170 exec->determinist = 0;
4171 xmlFARegExecRollBack(exec);
4172 if ((exec->inputStack != NULL ) && (exec->status == 0)) {
4173 value = exec->inputStack[exec->index].value;
4174 data = exec->inputStack[exec->index].data;
4175#ifdef DEBUG_PUSH
4176 printf("value loaded: %s\n", value);
4177#endif
4178 }
4179 }
4180 continue;
4181progress:
4182 progress = 1;
4183 continue;
4184 }
4185 if (exec->status == 0) {
4186 return(exec->state->type == XML_REGEXP_FINAL_STATE);
4187 }
4188#ifdef DEBUG_ERR
4189 if (exec->status < 0) {
4190 testerr(exec);
4191 }
4192#endif
4193 return(exec->status);
4194}
4195
4207int
4208xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4209 void *data) {
4210 return(xmlRegExecPushStringInternal(exec, value, data, 0));
4211}
4212
4225int
4226xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4227 const xmlChar *value2, void *data) {
4228 xmlChar buf[150];
4229 int lenn, lenp, ret;
4230 xmlChar *str;
4231
4232 if (exec == NULL)
4233 return(-1);
4234 if (exec->comp == NULL)
4235 return(-1);
4236 if (exec->status != 0)
4237 return(exec->status);
4238
4239 if (value2 == NULL)
4240 return(xmlRegExecPushString(exec, value, data));
4241
4242 lenn = strlen((char *) value2);
4243 lenp = strlen((char *) value);
4244
4245 if (150 < lenn + lenp + 2) {
4246 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4247 if (str == NULL) {
4248 exec->status = -1;
4249 return(-1);
4250 }
4251 } else {
4252 str = buf;
4253 }
4254 memcpy(&str[0], value, lenp);
4255 str[lenp] = XML_REG_STRING_SEPARATOR;
4256 memcpy(&str[lenp + 1], value2, lenn);
4257 str[lenn + lenp + 1] = 0;
4258
4259 if (exec->comp->compact != NULL)
4260 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4261 else
4262 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4263
4264 if (str != buf)
4265 xmlFree(str);
4266 return(ret);
4267}
4268
4283static int
4284xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4285 int *nbval, int *nbneg,
4286 xmlChar **values, int *terminal) {
4287 int maxval;
4288 int nb = 0;
4289
4290 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4291 (values == NULL) || (*nbval <= 0))
4292 return(-1);
4293
4294 maxval = *nbval;
4295 *nbval = 0;
4296 *nbneg = 0;
4297 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4298 xmlRegexpPtr comp;
4299 int target, i, state;
4300
4301 comp = exec->comp;
4302
4303 if (err) {
4304 if (exec->errStateNo == -1) return(-1);
4305 state = exec->errStateNo;
4306 } else {
4307 state = exec->index;
4308 }
4309 if (terminal != NULL) {
4310 if (comp->compact[state * (comp->nbstrings + 1)] ==
4311 XML_REGEXP_FINAL_STATE)
4312 *terminal = 1;
4313 else
4314 *terminal = 0;
4315 }
4316 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4317 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4318 if ((target > 0) && (target <= comp->nbstates) &&
4319 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4320 XML_REGEXP_SINK_STATE)) {
4321 values[nb++] = comp->stringMap[i];
4322 (*nbval)++;
4323 }
4324 }
4325 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4326 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4327 if ((target > 0) && (target <= comp->nbstates) &&
4328 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4329 XML_REGEXP_SINK_STATE)) {
4330 values[nb++] = comp->stringMap[i];
4331 (*nbneg)++;
4332 }
4333 }
4334 } else {
4335 int transno;
4336 xmlRegTransPtr trans;
4337 xmlRegAtomPtr atom;
4338 xmlRegStatePtr state;
4339
4340 if (terminal != NULL) {
4341 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4342 *terminal = 1;
4343 else
4344 *terminal = 0;
4345 }
4346
4347 if (err) {
4348 if (exec->errState == NULL) return(-1);
4349 state = exec->errState;
4350 } else {
4351 if (exec->state == NULL) return(-1);
4352 state = exec->state;
4353 }
4354 for (transno = 0;
4355 (transno < state->nbTrans) && (nb < maxval);
4356 transno++) {
4357 trans = &state->trans[transno];
4358 if (trans->to < 0)
4359 continue;
4360 atom = trans->atom;
4361 if ((atom == NULL) || (atom->valuep == NULL))
4362 continue;
4363 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4364 /* this should not be reached but ... */
4365 TODO;
4366 } else if (trans->count == REGEXP_ALL_COUNTER) {
4367 /* this should not be reached but ... */
4368 TODO;
4369 } else if (trans->counter >= 0) {
4370 xmlRegCounterPtr counter = NULL;
4371 int count;
4372
4373 if (err)
4374 count = exec->errCounts[trans->counter];
4375 else
4376 count = exec->counts[trans->counter];
4377 if (exec->comp != NULL)
4378 counter = &exec->comp->counters[trans->counter];
4379 if ((counter == NULL) || (count < counter->max)) {
4380 if (atom->neg)
4381 values[nb++] = (xmlChar *) atom->valuep2;
4382 else
4383 values[nb++] = (xmlChar *) atom->valuep;
4384 (*nbval)++;
4385 }
4386 } else {
4387 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4388 (exec->comp->states[trans->to]->type !=
4389 XML_REGEXP_SINK_STATE)) {
4390 if (atom->neg)
4391 values[nb++] = (xmlChar *) atom->valuep2;
4392 else
4393 values[nb++] = (xmlChar *) atom->valuep;
4394 (*nbval)++;
4395 }
4396 }
4397 }
4398 for (transno = 0;
4399 (transno < state->nbTrans) && (nb < maxval);
4400 transno++) {
4401 trans = &state->trans[transno];
4402 if (trans->to < 0)
4403 continue;
4404 atom = trans->atom;
4405 if ((atom == NULL) || (atom->valuep == NULL))
4406 continue;
4407 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4408 continue;
4409 } else if (trans->count == REGEXP_ALL_COUNTER) {
4410 continue;
4411 } else if (trans->counter >= 0) {
4412 continue;
4413 } else {
4414 if ((exec->comp->states[trans->to] != NULL) &&
4415 (exec->comp->states[trans->to]->type ==
4416 XML_REGEXP_SINK_STATE)) {
4417 if (atom->neg)
4418 values[nb++] = (xmlChar *) atom->valuep2;
4419 else
4420 values[nb++] = (xmlChar *) atom->valuep;
4421 (*nbneg)++;
4422 }
4423 }
4424 }
4425 }
4426 return(0);
4427}
4428
4446int
4447xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4448 xmlChar **values, int *terminal) {
4449 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4450}
4451
4471int
4472xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4473 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4474 if (exec == NULL)
4475 return(-1);
4476 if (string != NULL) {
4477 if (exec->status != 0)
4478 *string = exec->errString;
4479 else
4480 *string = NULL;
4481 }
4482 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4483}
4484
4485#ifdef DEBUG_ERR
4486static void testerr(xmlRegExecCtxtPtr exec) {
4487 const xmlChar *string;
4488 xmlChar *values[5];
4489 int nb = 5;
4490 int nbneg;
4491 int terminal;
4492 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
4493}
4494#endif
4495
4496#if 0
4497static int
4498xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4499 xmlRegTransPtr trans;
4500 xmlRegAtomPtr atom;
4501 int ret;
4502 int codepoint, len;
4503
4504 if (exec == NULL)
4505 return(-1);
4506 if (exec->status != 0)
4507 return(exec->status);
4508
4509 while ((exec->status == 0) &&
4510 ((exec->inputString[exec->index] != 0) ||
4511 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4512
4513 /*
4514 * End of input on non-terminal state, rollback, however we may
4515 * still have epsilon like transition for counted transitions
4516 * on counters, in that case don't break too early.
4517 */
4518 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4519 goto rollback;
4520
4521 exec->transcount = 0;
4522 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4523 trans = &exec->state->trans[exec->transno];
4524 if (trans->to < 0)
4525 continue;
4526 atom = trans->atom;
4527 ret = 0;
4528 if (trans->count >= 0) {
4529 int count;
4530 xmlRegCounterPtr counter;
4531
4532 /*
4533 * A counted transition.
4534 */
4535
4536 count = exec->counts[trans->count];
4537 counter = &exec->comp->counters[trans->count];
4538#ifdef DEBUG_REGEXP_EXEC
4539 printf("testing count %d: val %d, min %d, max %d\n",
4540 trans->count, count, counter->min, counter->max);
4541#endif
4542 ret = ((count >= counter->min) && (count <= counter->max));
4543 } else if (atom == NULL) {
4544 fprintf(stderr, "epsilon transition left at runtime\n");
4545 exec->status = -2;
4546 break;
4547 } else if (exec->inputString[exec->index] != 0) {
4548 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4549 ret = xmlRegCheckCharacter(atom, codepoint);
4550 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4551 xmlRegStatePtr to = exec->comp->states[trans->to];
4552
4553 /*
4554 * this is a multiple input sequence
4555 */
4556 if (exec->state->nbTrans > exec->transno + 1) {
4557 xmlFARegExecSave(exec);
4558 }
4559 exec->transcount = 1;
4560 do {
4561 /*
4562 * Try to progress as much as possible on the input
4563 */
4564 if (exec->transcount == atom->max) {
4565 break;
4566 }
4567 exec->index += len;
4568 /*
4569 * End of input: stop here
4570 */
4571 if (exec->inputString[exec->index] == 0) {
4572 exec->index -= len;
4573 break;
4574 }
4575 if (exec->transcount >= atom->min) {
4576 int transno = exec->transno;
4577 xmlRegStatePtr state = exec->state;
4578
4579 /*
4580 * The transition is acceptable save it
4581 */
4582 exec->transno = -1; /* trick */
4583 exec->state = to;
4584 xmlFARegExecSave(exec);
4585 exec->transno = transno;
4586 exec->state = state;
4587 }
4588 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4589 len);
4590 ret = xmlRegCheckCharacter(atom, codepoint);
4591 exec->transcount++;
4592 } while (ret == 1);
4593 if (exec->transcount < atom->min)
4594 ret = 0;
4595
4596 /*
4597 * If the last check failed but one transition was found
4598 * possible, rollback
4599 */
4600 if (ret < 0)
4601 ret = 0;
4602 if (ret == 0) {
4603 goto rollback;
4604 }
4605 }
4606 }
4607 if (ret == 1) {
4608 if (exec->state->nbTrans > exec->transno + 1) {
4609 xmlFARegExecSave(exec);
4610 }
4611 /*
4612 * restart count for expressions like this ((abc){2})*
4613 */
4614 if (trans->count >= 0) {
4615#ifdef DEBUG_REGEXP_EXEC
4616 printf("Reset count %d\n", trans->count);
4617#endif
4618 exec->counts[trans->count] = 0;
4619 }
4620 if (trans->counter >= 0) {
4621#ifdef DEBUG_REGEXP_EXEC
4622 printf("Increasing count %d\n", trans->counter);
4623#endif
4624 exec->counts[trans->counter]++;
4625 }
4626#ifdef DEBUG_REGEXP_EXEC
4627 printf("entering state %d\n", trans->to);
4628#endif
4629 exec->state = exec->comp->states[trans->to];
4630 exec->transno = 0;
4631 if (trans->atom != NULL) {
4632 exec->index += len;
4633 }
4634 goto progress;
4635 } else if (ret < 0) {
4636 exec->status = -4;
4637 break;
4638 }
4639 }
4640 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4641rollback:
4642 /*
4643 * Failed to find a way out
4644 */
4645 exec->determinist = 0;
4646 xmlFARegExecRollBack(exec);
4647 }
4648progress:
4649 continue;
4650 }
4651}
4652#endif
4653/************************************************************************
4654 * *
4655 * Parser for the Schemas Datatype Regular Expressions *
4656 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4657 * *
4658 ************************************************************************/
4659
4666static int
4667xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4668 int cur;
4669 int len;
4670
4671 cur = CUR_SCHAR(ctxt->cur, len);
4672 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4673 (cur == '*') || (cur == '+') || (cur == '(') ||
4674 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4675 (cur == 0x5D) || (cur == 0))
4676 return(-1);
4677 return(cur);
4678}
4679
4696static void
4697xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4698 int cur;
4699 xmlRegAtomType type = (xmlRegAtomType) 0;
4700 xmlChar *blockName = NULL;
4701
4702 cur = CUR;
4703 if (cur == 'L') {
4704 NEXT;
4705 cur = CUR;
4706 if (cur == 'u') {
4707 NEXT;
4708 type = XML_REGEXP_LETTER_UPPERCASE;
4709 } else if (cur == 'l') {
4710 NEXT;
4711 type = XML_REGEXP_LETTER_LOWERCASE;
4712 } else if (cur == 't') {
4713 NEXT;
4714 type = XML_REGEXP_LETTER_TITLECASE;
4715 } else if (cur == 'm') {
4716 NEXT;
4717 type = XML_REGEXP_LETTER_MODIFIER;
4718 } else if (cur == 'o') {
4719 NEXT;
4720 type = XML_REGEXP_LETTER_OTHERS;
4721 } else {
4722 type = XML_REGEXP_LETTER;
4723 }
4724 } else if (cur == 'M') {
4725 NEXT;
4726 cur = CUR;
4727 if (cur == 'n') {
4728 NEXT;
4729 /* nonspacing */
4730 type = XML_REGEXP_MARK_NONSPACING;
4731 } else if (cur == 'c') {
4732 NEXT;
4733 /* spacing combining */
4734 type = XML_REGEXP_MARK_SPACECOMBINING;
4735 } else if (cur == 'e') {
4736 NEXT;
4737 /* enclosing */
4738 type = XML_REGEXP_MARK_ENCLOSING;
4739 } else {
4740 /* all marks */
4741 type = XML_REGEXP_MARK;
4742 }
4743 } else if (cur == 'N') {
4744 NEXT;
4745 cur = CUR;
4746 if (cur == 'd') {
4747 NEXT;
4748 /* digital */
4749 type = XML_REGEXP_NUMBER_DECIMAL;
4750 } else if (cur == 'l') {
4751 NEXT;
4752 /* letter */
4753 type = XML_REGEXP_NUMBER_LETTER;
4754 } else if (cur == 'o') {
4755 NEXT;
4756 /* other */
4757 type = XML_REGEXP_NUMBER_OTHERS;
4758 } else {
4759 /* all numbers */
4760 type = XML_REGEXP_NUMBER;
4761 }
4762 } else if (cur == 'P') {
4763 NEXT;
4764 cur = CUR;
4765 if (cur == 'c') {
4766 NEXT;
4767 /* connector */
4768 type = XML_REGEXP_PUNCT_CONNECTOR;
4769 } else if (cur == 'd') {
4770 NEXT;
4771 /* dash */
4772 type = XML_REGEXP_PUNCT_DASH;
4773 } else if (cur == 's') {
4774 NEXT;
4775 /* open */
4776 type = XML_REGEXP_PUNCT_OPEN;
4777 } else if (cur == 'e') {
4778 NEXT;
4779 /* close */
4780 type = XML_REGEXP_PUNCT_CLOSE;
4781 } else if (cur == 'i') {
4782 NEXT;
4783 /* initial quote */
4784 type = XML_REGEXP_PUNCT_INITQUOTE;
4785 } else if (cur == 'f') {
4786 NEXT;
4787 /* final quote */
4788 type = XML_REGEXP_PUNCT_FINQUOTE;
4789 } else if (cur == 'o') {
4790 NEXT;
4791 /* other */
4792 type = XML_REGEXP_PUNCT_OTHERS;
4793 } else {
4794 /* all punctuation */
4795 type = XML_REGEXP_PUNCT;
4796 }
4797 } else if (cur == 'Z') {
4798 NEXT;
4799 cur = CUR;
4800 if (cur == 's') {
4801 NEXT;
4802 /* space */
4803 type = XML_REGEXP_SEPAR_SPACE;
4804 } else if (cur == 'l') {
4805 NEXT;
4806 /* line */
4807 type = XML_REGEXP_SEPAR_LINE;
4808 } else if (cur == 'p') {
4809 NEXT;
4810 /* paragraph */
4811 type = XML_REGEXP_SEPAR_PARA;
4812 } else {
4813 /* all separators */
4814 type = XML_REGEXP_SEPAR;
4815 }
4816 } else if (cur == 'S') {
4817 NEXT;
4818 cur = CUR;
4819 if (cur == 'm') {
4820 NEXT;
4821 type = XML_REGEXP_SYMBOL_MATH;
4822 /* math */
4823 } else if (cur == 'c') {
4824 NEXT;
4825 type = XML_REGEXP_SYMBOL_CURRENCY;
4826 /* currency */
4827 } else if (cur == 'k') {
4828 NEXT;
4829 type = XML_REGEXP_SYMBOL_MODIFIER;
4830 /* modifiers */
4831 } else if (cur == 'o') {
4832 NEXT;
4833 type = XML_REGEXP_SYMBOL_OTHERS;
4834 /* other */
4835 } else {
4836 /* all symbols */
4837 type = XML_REGEXP_SYMBOL;
4838 }
4839 } else if (cur == 'C') {
4840 NEXT;
4841 cur = CUR;
4842 if (cur == 'c') {
4843 NEXT;
4844 /* control */
4845 type = XML_REGEXP_OTHER_CONTROL;
4846 } else if (cur == 'f') {
4847 NEXT;
4848 /* format */
4849 type = XML_REGEXP_OTHER_FORMAT;
4850 } else if (cur == 'o') {
4851 NEXT;
4852 /* private use */
4853 type = XML_REGEXP_OTHER_PRIVATE;
4854 } else if (cur == 'n') {
4855 NEXT;
4856 /* not assigned */
4857 type = XML_REGEXP_OTHER_NA;
4858 } else {
4859 /* all others */
4860 type = XML_REGEXP_OTHER;
4861 }
4862 } else if (cur == 'I') {
4863 const xmlChar *start;
4864 NEXT;
4865 cur = CUR;
4866 if (cur != 's') {
4867 ERROR("IsXXXX expected");
4868 return;
4869 }
4870 NEXT;
4871 start = ctxt->cur;
4872 cur = CUR;
4873 if (((cur >= 'a') && (cur <= 'z')) ||
4874 ((cur >= 'A') && (cur <= 'Z')) ||
4875 ((cur >= '0') && (cur <= '9')) ||
4876 (cur == 0x2D)) {
4877 NEXT;
4878 cur = CUR;
4879 while (((cur >= 'a') && (cur <= 'z')) ||
4880 ((cur >= 'A') && (cur <= 'Z')) ||
4881 ((cur >= '0') && (cur <= '9')) ||
4882 (cur == 0x2D)) {
4883 NEXT;
4884 cur = CUR;
4885 }
4886 }
4887 type = XML_REGEXP_BLOCK_NAME;
4888 blockName = xmlStrndup(start, ctxt->cur - start);
4889 } else {
4890 ERROR("Unknown char property");
4891 return;
4892 }
4893 if (ctxt->atom == NULL) {
4894 ctxt->atom = xmlRegNewAtom(ctxt, type);
4895 if (ctxt->atom != NULL)
4896 ctxt->atom->valuep = blockName;
4897 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4898 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4899 type, 0, 0, blockName);
4900 }
4901}
4902
4903static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
4904{
4905 int val = 0, i, cur;
4906 for (i = 0; i < 4; i++) {
4907 NEXT;
4908 val *= 16;
4909 cur = CUR;
4910 if (cur >= '0' && cur <= '9') {
4911 val += cur - '0';
4912 } else if (cur >= 'A' && cur <= 'F') {
4913 val += cur - 'A' + 10;
4914 } else if (cur >= 'a' && cur <= 'f') {
4915 val += cur - 'a' + 10;
4916 } else {
4917 ERROR("Expecting hex digit");
4918 return -1;
4919 }
4920 }
4921 return val;
4922}
4923
4924static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
4925{
4926 int val = parse_escaped_codeunit(ctxt);
4927 if (0xD800 <= val && val <= 0xDBFF) {
4928 NEXT;
4929 if (CUR == '\\') {
4930 NEXT;
4931 if (CUR == 'u') {
4932 int low = parse_escaped_codeunit(ctxt);
4933 if (0xDC00 <= low && low <= 0xDFFF) {
4934 return (val - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
4935 }
4936 }
4937 }
4938 ERROR("Invalid low surrogate pair code unit");
4939 val = -1;
4940 }
4941 return val;
4942}
4943
4954static void
4955xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4956 int cur;
4957
4958 if (CUR == '.') {
4959 if (ctxt->atom == NULL) {
4960 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4961 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4962 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4963 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4964 }
4965 NEXT;
4966 return;
4967 }
4968 if (CUR != '\\') {
4969 ERROR("Escaped sequence: expecting \\");
4970 return;
4971 }
4972 NEXT;
4973 cur = CUR;
4974 if (cur == 'p') {
4975 NEXT;
4976 if (CUR != '{') {
4977 ERROR("Expecting '{'");
4978 return;
4979 }
4980 NEXT;
4981 xmlFAParseCharProp(ctxt);
4982 if (CUR != '}') {
4983 ERROR("Expecting '}'");
4984 return;
4985 }
4986 NEXT;
4987 } else if (cur == 'P') {
4988 NEXT;
4989 if (CUR != '{') {
4990 ERROR("Expecting '{'");
4991 return;
4992 }
4993 NEXT;
4994 xmlFAParseCharProp(ctxt);
4995 if (ctxt->atom != NULL)
4996 ctxt->atom->neg = 1;
4997 if (CUR != '}') {
4998 ERROR("Expecting '}'");
4999 return;
5000 }
5001 NEXT;
5002 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
5003 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
5004 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
5005 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
5006 (cur == 0x5E) ||
5007
5008 /* Non-standard escape sequences:
5009 * Java 1.8|.NET Core 3.1|MSXML 6 */
5010 (cur == '!') || /* + | + | + */
5011 (cur == '"') || /* + | + | + */
5012 (cur == '#') || /* + | + | + */
5013 (cur == '$') || /* + | + | + */
5014 (cur == '%') || /* + | + | + */
5015 (cur == ',') || /* + | + | + */
5016 (cur == '/') || /* + | + | + */
5017 (cur == ':') || /* + | + | + */
5018 (cur == ';') || /* + | + | + */
5019 (cur == '=') || /* + | + | + */
5020 (cur == '>') || /* | + | + */
5021 (cur == '@') || /* + | + | + */
5022 (cur == '`') || /* + | + | + */
5023 (cur == '~') || /* + | + | + */
5024 (cur == 'u')) { /* | + | + */
5025 if (ctxt->atom == NULL) {
5026 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5027 if (ctxt->atom != NULL) {
5028 switch (cur) {
5029 case 'n':
5030 ctxt->atom->codepoint = '\n';
5031 break;
5032 case 'r':
5033 ctxt->atom->codepoint = '\r';
5034 break;
5035 case 't':
5036 ctxt->atom->codepoint = '\t';
5037 break;
5038 case 'u':
5039 cur = parse_escaped_codepoint(ctxt);
5040 if (cur < 0) {
5041 return;
5042 }
5043 ctxt->atom->codepoint = cur;
5044 break;
5045 default:
5046 ctxt->atom->codepoint = cur;
5047 }
5048 }
5049 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5050 switch (cur) {
5051 case 'n':
5052 cur = '\n';
5053 break;
5054 case 'r':
5055 cur = '\r';
5056 break;
5057 case 't':
5058 cur = '\t';
5059 break;
5060 }
5061 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5062 XML_REGEXP_CHARVAL, cur, cur, NULL);
5063 }
5064 NEXT;
5065 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
5066 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
5067 (cur == 'w') || (cur == 'W')) {
5068 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
5069
5070 switch (cur) {
5071 case 's':
5072 type = XML_REGEXP_ANYSPACE;
5073 break;
5074 case 'S':
5075 type = XML_REGEXP_NOTSPACE;
5076 break;
5077 case 'i':
5078 type = XML_REGEXP_INITNAME;
5079 break;
5080 case 'I':
5081 type = XML_REGEXP_NOTINITNAME;
5082 break;
5083 case 'c':
5084 type = XML_REGEXP_NAMECHAR;
5085 break;
5086 case 'C':
5087 type = XML_REGEXP_NOTNAMECHAR;
5088 break;
5089 case 'd':
5090 type = XML_REGEXP_DECIMAL;
5091 break;
5092 case 'D':
5093 type = XML_REGEXP_NOTDECIMAL;
5094 break;
5095 case 'w':
5096 type = XML_REGEXP_REALCHAR;
5097 break;
5098 case 'W':
5099 type = XML_REGEXP_NOTREALCHAR;
5100 break;
5101 }
5102 NEXT;
5103 if (ctxt->atom == NULL) {
5104 ctxt->atom = xmlRegNewAtom(ctxt, type);
5105 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5106 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5107 type, 0, 0, NULL);
5108 }
5109 } else {
5110 ERROR("Wrong escape sequence, misuse of character '\\'");
5111 }
5112}
5113
5124static void
5125xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
5126 int cur, len;
5127 int start = -1;
5128 int end = -1;
5129
5130 if (CUR == '\0') {
5131 ERROR("Expecting ']'");
5132 return;
5133 }
5134
5135 cur = CUR;
5136 if (cur == '\\') {
5137 NEXT;
5138 cur = CUR;
5139 switch (cur) {
5140 case 'n': start = 0xA; break;
5141 case 'r': start = 0xD; break;
5142 case 't': start = 0x9; break;
5143 case '\\': case '|': case '.': case '-': case '^': case '?':
5144 case '*': case '+': case '{': case '}': case '(': case ')':
5145 case '[': case ']':
5146 start = cur; break;
5147 default:
5148 ERROR("Invalid escape value");
5149 return;
5150 }
5151 end = start;
5152 len = 1;
5153 } else if ((cur != 0x5B) && (cur != 0x5D)) {
5154 end = start = CUR_SCHAR(ctxt->cur, len);
5155 } else {
5156 ERROR("Expecting a char range");
5157 return;
5158 }
5159 /*
5160 * Since we are "inside" a range, we can assume ctxt->cur is past
5161 * the start of ctxt->string, and PREV should be safe
5162 */
5163 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5164 NEXTL(len);
5165 return;
5166 }
5167 NEXTL(len);
5168 cur = CUR;
5169 if ((cur != '-') || (NXT(1) == '[') || (NXT(1) == ']')) {
5170 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5171 XML_REGEXP_CHARVAL, start, end, NULL);
5172 return;
5173 }
5174 NEXT;
5175 cur = CUR;
5176 if (cur == '\\') {
5177 NEXT;
5178 cur = CUR;
5179 switch (cur) {
5180 case 'n': end = 0xA; break;
5181 case 'r': end = 0xD; break;
5182 case 't': end = 0x9; break;
5183 case '\\': case '|': case '.': case '-': case '^': case '?':
5184 case '*': case '+': case '{': case '}': case '(': case ')':
5185 case '[': case ']':
5186 end = cur; break;
5187 default:
5188 ERROR("Invalid escape value");
5189 return;
5190 }
5191 len = 1;
5192 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
5193 end = CUR_SCHAR(ctxt->cur, len);
5194 } else {
5195 ERROR("Expecting the end of a char range");
5196 return;
5197 }
5198
5199 /* TODO check that the values are acceptable character ranges for XML */
5200 if (end < start) {
5201 ERROR("End of range is before start of range");
5202 } else {
5203 NEXTL(len);
5204 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5205 XML_REGEXP_CHARVAL, start, end, NULL);
5206 }
5207 return;
5208}
5209
5216static void
5217xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5218 do {
5219 if (CUR == '\\') {
5220 xmlFAParseCharClassEsc(ctxt);
5221 } else {
5222 xmlFAParseCharRange(ctxt);
5223 }
5224 } while ((CUR != ']') && (CUR != '-') &&
5225 (CUR != 0) && (ctxt->error == 0));
5226}
5227
5237static void
5238xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5239 int neg = ctxt->neg;
5240
5241 if (CUR == '^') {
5242 NEXT;
5243 ctxt->neg = !ctxt->neg;
5244 xmlFAParsePosCharGroup(ctxt);
5245 ctxt->neg = neg;
5246 }
5247 while ((CUR != ']') && (ctxt->error == 0)) {
5248 if ((CUR == '-') && (NXT(1) == '[')) {
5249 NEXT; /* eat the '-' */
5250 NEXT; /* eat the '[' */
5251 ctxt->neg = 2;
5252 xmlFAParseCharGroup(ctxt);
5253 ctxt->neg = neg;
5254 if (CUR == ']') {
5255 NEXT;
5256 } else {
5257 ERROR("charClassExpr: ']' expected");
5258 }
5259 break;
5260 } else {
5261 xmlFAParsePosCharGroup(ctxt);
5262 }
5263 }
5264}
5265
5273static void
5274xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5275 if (CUR == '[') {
5276 NEXT;
5277 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5278 if (ctxt->atom == NULL)
5279 return;
5280 xmlFAParseCharGroup(ctxt);
5281 if (CUR == ']') {
5282 NEXT;
5283 } else {
5284 ERROR("xmlFAParseCharClass: ']' expected");
5285 }
5286 } else {
5287 xmlFAParseCharClassEsc(ctxt);
5288 }
5289}
5290
5299static int
5300xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5301 int ret = 0;
5302 int ok = 0;
5303 int overflow = 0;
5304
5305 while ((CUR >= '0') && (CUR <= '9')) {
5306 if (ret > INT_MAX / 10) {
5307 overflow = 1;
5308 } else {
5309 int digit = CUR - '0';
5310
5311 ret *= 10;
5312 if (ret > INT_MAX - digit)
5313 overflow = 1;
5314 else
5315 ret += digit;
5316 }
5317 ok = 1;
5318 NEXT;
5319 }
5320 if ((ok != 1) || (overflow == 1)) {
5321 return(-1);
5322 }
5323 return(ret);
5324}
5325
5336static int
5337xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5338 int cur;
5339
5340 cur = CUR;
5341 if ((cur == '?') || (cur == '*') || (cur == '+')) {
5342 if (ctxt->atom != NULL) {
5343 if (cur == '?')
5344 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5345 else if (cur == '*')
5346 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5347 else if (cur == '+')
5348 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5349 }
5350 NEXT;
5351 return(1);
5352 }
5353 if (cur == '{') {
5354 int min = 0, max = 0;
5355
5356 NEXT;
5357 cur = xmlFAParseQuantExact(ctxt);
5358 if (cur >= 0)
5359 min = cur;
5360 else {
5361 ERROR("Improper quantifier");
5362 }
5363 if (CUR == ',') {
5364 NEXT;
5365 if (CUR == '}')
5366 max = INT_MAX;
5367 else {
5368 cur = xmlFAParseQuantExact(ctxt);
5369 if (cur >= 0)
5370 max = cur;
5371 else {
5372 ERROR("Improper quantifier");
5373 }
5374 }
5375 }
5376 if (CUR == '}') {
5377 NEXT;
5378 } else {
5379 ERROR("Unterminated quantifier");
5380 }
5381 if (max == 0)
5382 max = min;
5383 if (ctxt->atom != NULL) {
5384 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5385 ctxt->atom->min = min;
5386 ctxt->atom->max = max;
5387 }
5388 return(1);
5389 }
5390 return(0);
5391}
5392
5399static int
5400xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5401 int codepoint, len;
5402
5403 codepoint = xmlFAIsChar(ctxt);
5404 if (codepoint > 0) {
5405 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5406 if (ctxt->atom == NULL)
5407 return(-1);
5408 codepoint = CUR_SCHAR(ctxt->cur, len);
5409 ctxt->atom->codepoint = codepoint;
5410 NEXTL(len);
5411 return(1);
5412 } else if (CUR == '|') {
5413 return(0);
5414 } else if (CUR == 0) {
5415 return(0);
5416 } else if (CUR == ')') {
5417 return(0);
5418 } else if (CUR == '(') {
5419 xmlRegStatePtr start, oldend, start0;
5420
5421 NEXT;
5422 if (ctxt->depth >= 50) {
5423 ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5424 return(-1);
5425 }
5426 /*
5427 * this extra Epsilon transition is needed if we count with 0 allowed
5428 * unfortunately this can't be known at that point
5429 */
5430 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5431 start0 = ctxt->state;
5432 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5433 start = ctxt->state;
5434 oldend = ctxt->end;
5435 ctxt->end = NULL;
5436 ctxt->atom = NULL;
5437 ctxt->depth++;
5438 xmlFAParseRegExp(ctxt, 0);
5439 ctxt->depth--;
5440 if (CUR == ')') {
5441 NEXT;
5442 } else {
5443 ERROR("xmlFAParseAtom: expecting ')'");
5444 }
5445 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5446 if (ctxt->atom == NULL)
5447 return(-1);
5448 ctxt->atom->start = start;
5449 ctxt->atom->start0 = start0;
5450 ctxt->atom->stop = ctxt->state;
5451 ctxt->end = oldend;
5452 return(1);
5453 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5454 xmlFAParseCharClass(ctxt);
5455 return(1);
5456 }
5457 return(0);
5458}
5459
5466static int
5467xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5468 int ret;
5469
5470 ctxt->atom = NULL;
5471 ret = xmlFAParseAtom(ctxt);
5472 if (ret == 0)
5473 return(0);
5474 if (ctxt->atom == NULL) {
5475 ERROR("internal: no atom generated");
5476 }
5477 xmlFAParseQuantifier(ctxt);
5478 return(1);
5479}
5480
5491static int
5492xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5493 xmlRegStatePtr previous;
5494 int ret;
5495
5496 previous = ctxt->state;
5497 ret = xmlFAParsePiece(ctxt);
5498 if (ret == 0) {
5499 /* Empty branch */
5500 xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5501 } else {
5502 if (xmlFAGenerateTransitions(ctxt, previous,
5503 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0)
5504 return(-1);
5505 previous = ctxt->state;
5506 ctxt->atom = NULL;
5507 }
5508 while ((ret != 0) && (ctxt->error == 0)) {
5509 ret = xmlFAParsePiece(ctxt);
5510 if (ret != 0) {
5511 if (xmlFAGenerateTransitions(ctxt, previous,
5512 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5513 ctxt->atom) < 0)
5514 return(-1);
5515 previous = ctxt->state;
5516 ctxt->atom = NULL;
5517 }
5518 }
5519 return(0);
5520}
5521
5529static void
5530xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5531 xmlRegStatePtr start, end;
5532
5533 /* if not top start should have been generated by an epsilon trans */
5534 start = ctxt->state;
5535 ctxt->end = NULL;
5536 xmlFAParseBranch(ctxt, NULL);
5537 if (top) {
5538#ifdef DEBUG_REGEXP_GRAPH
5539 printf("State %d is final\n", ctxt->state->no);
5540#endif
5541 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5542 }
5543 if (CUR != '|') {
5544 ctxt->end = ctxt->state;
5545 return;
5546 }
5547 end = ctxt->state;
5548 while ((CUR == '|') && (ctxt->error == 0)) {
5549 NEXT;
5550 ctxt->state = start;
5551 ctxt->end = NULL;
5552 xmlFAParseBranch(ctxt, end);
5553 }
5554 if (!top) {
5555 ctxt->state = end;
5556 ctxt->end = end;
5557 }
5558}
5559
5560/************************************************************************
5561 * *
5562 * The basic API *
5563 * *
5564 ************************************************************************/
5565
5573void
5574xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5575 int i;
5576
5577 if (output == NULL)
5578 return;
5579 fprintf(output, " regexp: ");
5580 if (regexp == NULL) {
5581 fprintf(output, "NULL\n");
5582 return;
5583 }
5584 fprintf(output, "'%s' ", regexp->string);
5585 fprintf(output, "\n");
5586 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5587 for (i = 0;i < regexp->nbAtoms; i++) {
5588 fprintf(output, " %02d ", i);
5589 xmlRegPrintAtom(output, regexp->atoms[i]);
5590 }
5591 fprintf(output, "%d states:", regexp->nbStates);
5592 fprintf(output, "\n");
5593 for (i = 0;i < regexp->nbStates; i++) {
5594 xmlRegPrintState(output, regexp->states[i]);
5595 }
5596 fprintf(output, "%d counters:\n", regexp->nbCounters);
5597 for (i = 0;i < regexp->nbCounters; i++) {
5598 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5599 regexp->counters[i].max);
5600 }
5601}
5602
5613xmlRegexpPtr
5614xmlRegexpCompile(const xmlChar *regexp) {
5615 xmlRegexpPtr ret;
5616 xmlRegParserCtxtPtr ctxt;
5617
5618 ctxt = xmlRegNewParserCtxt(regexp);
5619 if (ctxt == NULL)
5620 return(NULL);
5621
5622 /* initialize the parser */
5623 ctxt->end = NULL;
5624 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5625 xmlRegStatePush(ctxt, ctxt->start);
5626
5627 /* parse the expression building an automata */
5628 xmlFAParseRegExp(ctxt, 1);
5629 if (CUR != 0) {
5630 ERROR("xmlFAParseRegExp: extra characters");
5631 }
5632 if (ctxt->error != 0) {
5633 xmlRegFreeParserCtxt(ctxt);
5634 return(NULL);
5635 }
5636 ctxt->end = ctxt->state;
5637 ctxt->start->type = XML_REGEXP_START_STATE;
5638 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5639
5640 /* remove the Epsilon except for counted transitions */
5641 xmlFAEliminateEpsilonTransitions(ctxt);
5642
5643
5644 if (ctxt->error != 0) {
5645 xmlRegFreeParserCtxt(ctxt);
5646 return(NULL);
5647 }
5648 ret = xmlRegEpxFromParse(ctxt);
5649 xmlRegFreeParserCtxt(ctxt);
5650 return(ret);
5651}
5652
5662int
5663xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5664 if ((comp == NULL) || (content == NULL))
5665 return(-1);
5666 return(xmlFARegExec(comp, content));
5667}
5668
5677int
5678xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5679 xmlAutomataPtr am;
5680 int ret;
5681
5682 if (comp == NULL)
5683 return(-1);
5684 if (comp->determinist != -1)
5685 return(comp->determinist);
5686
5687 am = xmlNewAutomata();
5688 if (am == NULL)
5689 return(-1);
5690 if (am->states != NULL) {
5691 int i;
5692
5693 for (i = 0;i < am->nbStates;i++)
5694 xmlRegFreeState(am->states[i]);
5695 xmlFree(am->states);
5696 }
5697 am->nbAtoms = comp->nbAtoms;
5698 am->atoms = comp->atoms;
5699 am->nbStates = comp->nbStates;
5700 am->states = comp->states;
5701 am->determinist = -1;
5702 am->flags = comp->flags;
5703 ret = xmlFAComputesDeterminism(am);
5704 am->atoms = NULL;
5705 am->states = NULL;
5706 xmlFreeAutomata(am);
5707 comp->determinist = ret;
5708 return(ret);
5709}
5710
5717void
5718xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5719 int i;
5720 if (regexp == NULL)
5721 return;
5722
5723 if (regexp->string != NULL)
5724 xmlFree(regexp->string);
5725 if (regexp->states != NULL) {
5726 for (i = 0;i < regexp->nbStates;i++)
5727 xmlRegFreeState(regexp->states[i]);
5728 xmlFree(regexp->states);
5729 }
5730 if (regexp->atoms != NULL) {
5731 for (i = 0;i < regexp->nbAtoms;i++)
5732 xmlRegFreeAtom(regexp->atoms[i]);
5733 xmlFree(regexp->atoms);
5734 }
5735 if (regexp->counters != NULL)
5736 xmlFree(regexp->counters);
5737 if (regexp->compact != NULL)
5738 xmlFree(regexp->compact);
5739 if (regexp->transdata != NULL)
5740 xmlFree(regexp->transdata);
5741 if (regexp->stringMap != NULL) {
5742 for (i = 0; i < regexp->nbstrings;i++)
5743 xmlFree(regexp->stringMap[i]);
5744 xmlFree(regexp->stringMap);
5745 }
5746
5747 xmlFree(regexp);
5748}
5749
5750#ifdef LIBXML_AUTOMATA_ENABLED
5751/************************************************************************
5752 * *
5753 * The Automata interface *
5754 * *
5755 ************************************************************************/
5756
5764xmlAutomataPtr
5765xmlNewAutomata(void) {
5766 xmlAutomataPtr ctxt;
5767
5768 ctxt = xmlRegNewParserCtxt(NULL);
5769 if (ctxt == NULL)
5770 return(NULL);
5771
5772 /* initialize the parser */
5773 ctxt->end = NULL;
5774 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5775 if (ctxt->start == NULL) {
5776 xmlFreeAutomata(ctxt);
5777 return(NULL);
5778 }
5779 ctxt->start->type = XML_REGEXP_START_STATE;
5780 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5781 xmlRegFreeState(ctxt->start);
5782 xmlFreeAutomata(ctxt);
5783 return(NULL);
5784 }
5785 ctxt->flags = 0;
5786
5787 return(ctxt);
5788}
5789
5796void
5797xmlFreeAutomata(xmlAutomataPtr am) {
5798 if (am == NULL)
5799 return;
5800 xmlRegFreeParserCtxt(am);
5801}
5802
5810void
5811xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5812 if (am == NULL)
5813 return;
5814 am->flags |= flags;
5815}
5816
5825xmlAutomataStatePtr
5826xmlAutomataGetInitState(xmlAutomataPtr am) {
5827 if (am == NULL)
5828 return(NULL);
5829 return(am->start);
5830}
5831
5841int
5842xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5843 if ((am == NULL) || (state == NULL))
5844 return(-1);
5845 state->type = XML_REGEXP_FINAL_STATE;
5846 return(0);
5847}
5848
5863xmlAutomataStatePtr
5864xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5865 xmlAutomataStatePtr to, const xmlChar *token,
5866 void *data) {
5867 xmlRegAtomPtr atom;
5868
5869 if ((am == NULL) || (from == NULL) || (token == NULL))
5870 return(NULL);
5871 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5872 if (a