ReactOS  0.4.15-dev-3303-g1ade494
xmlregexp.c
Go to the documentation of this file.
1 /*
2  * regexp.c: generic and extensible Regular Expression engine
3  *
4  * Basically designed with the purpose of compiling regexps for
5  * the variety of validation/schemas mechanisms now available in
6  * XML related specifications these include:
7  * - XML-1.0 DTD validation
8  * - XML Schemas structure part 1
9  * - XML Schemas Datatypes part 2 especially Appendix F
10  * - RELAX-NG/TREX i.e. the counter proposal
11  *
12  * See Copyright for the status of this software.
13  *
14  * Daniel Veillard <veillard@redhat.com>
15  */
16 
17 #define IN_LIBXML
18 #include "libxml.h"
19 
20 #ifdef LIBXML_REGEXP_ENABLED
21 
22 /* #define DEBUG_ERR */
23 
24 #include <stdio.h>
25 #include <string.h>
26 #ifdef HAVE_LIMITS_H
27 #include <limits.h>
28 #endif
29 #ifdef HAVE_STDINT_H
30 #include <stdint.h>
31 #endif
32 
33 #include <libxml/tree.h>
34 #include <libxml/parserInternals.h>
35 #include <libxml/xmlregexp.h>
36 #include <libxml/xmlautomata.h>
37 #include <libxml/xmlunicode.h>
38 
39 #ifndef INT_MAX
40 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */
41 #endif
42 #ifndef SIZE_MAX
43 #define SIZE_MAX ((size_t) -1)
44 #endif
45 
46 /* #define DEBUG_REGEXP_GRAPH */
47 /* #define DEBUG_REGEXP_EXEC */
48 /* #define DEBUG_PUSH */
49 /* #define DEBUG_COMPACTION */
50 
51 #define MAX_PUSH 10000000
52 
53 #ifdef ERROR
54 #undef ERROR
55 #endif
56 #define ERROR(str) \
57  ctxt->error = XML_REGEXP_COMPILE_ERROR; \
58  xmlRegexpErrCompile(ctxt, str);
59 #define NEXT ctxt->cur++
60 #define CUR (*(ctxt->cur))
61 #define NXT(index) (ctxt->cur[index])
62 
63 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
64 #define NEXTL(l) ctxt->cur += l;
65 #define XML_REG_STRING_SEPARATOR '|'
66 /*
67  * Need PREV to check on a '-' within a Character Group. May only be used
68  * when it's guaranteed that cur is not at the beginning of ctxt->string!
69  */
70 #define PREV (ctxt->cur[-1])
71 
77 #define TODO \
78  xmlGenericError(xmlGenericErrorContext, \
79  "Unimplemented block at %s:%d\n", \
80  __FILE__, __LINE__);
81 
82 /************************************************************************
83  * *
84  * Datatypes and structures *
85  * *
86  ************************************************************************/
87 
88 /*
89  * Note: the order of the enums below is significant, do not shuffle
90  */
91 typedef enum {
92  XML_REGEXP_EPSILON = 1,
93  XML_REGEXP_CHARVAL,
94  XML_REGEXP_RANGES,
95  XML_REGEXP_SUBREG, /* used for () sub regexps */
96  XML_REGEXP_STRING,
97  XML_REGEXP_ANYCHAR, /* . */
98  XML_REGEXP_ANYSPACE, /* \s */
99  XML_REGEXP_NOTSPACE, /* \S */
100  XML_REGEXP_INITNAME, /* \l */
101  XML_REGEXP_NOTINITNAME, /* \L */
102  XML_REGEXP_NAMECHAR, /* \c */
103  XML_REGEXP_NOTNAMECHAR, /* \C */
104  XML_REGEXP_DECIMAL, /* \d */
105  XML_REGEXP_NOTDECIMAL, /* \D */
106  XML_REGEXP_REALCHAR, /* \w */
107  XML_REGEXP_NOTREALCHAR, /* \W */
108  XML_REGEXP_LETTER = 100,
109  XML_REGEXP_LETTER_UPPERCASE,
110  XML_REGEXP_LETTER_LOWERCASE,
111  XML_REGEXP_LETTER_TITLECASE,
112  XML_REGEXP_LETTER_MODIFIER,
113  XML_REGEXP_LETTER_OTHERS,
114  XML_REGEXP_MARK,
115  XML_REGEXP_MARK_NONSPACING,
116  XML_REGEXP_MARK_SPACECOMBINING,
117  XML_REGEXP_MARK_ENCLOSING,
118  XML_REGEXP_NUMBER,
119  XML_REGEXP_NUMBER_DECIMAL,
120  XML_REGEXP_NUMBER_LETTER,
121  XML_REGEXP_NUMBER_OTHERS,
122  XML_REGEXP_PUNCT,
123  XML_REGEXP_PUNCT_CONNECTOR,
124  XML_REGEXP_PUNCT_DASH,
125  XML_REGEXP_PUNCT_OPEN,
126  XML_REGEXP_PUNCT_CLOSE,
127  XML_REGEXP_PUNCT_INITQUOTE,
128  XML_REGEXP_PUNCT_FINQUOTE,
129  XML_REGEXP_PUNCT_OTHERS,
130  XML_REGEXP_SEPAR,
131  XML_REGEXP_SEPAR_SPACE,
132  XML_REGEXP_SEPAR_LINE,
133  XML_REGEXP_SEPAR_PARA,
134  XML_REGEXP_SYMBOL,
135  XML_REGEXP_SYMBOL_MATH,
136  XML_REGEXP_SYMBOL_CURRENCY,
137  XML_REGEXP_SYMBOL_MODIFIER,
138  XML_REGEXP_SYMBOL_OTHERS,
139  XML_REGEXP_OTHER,
140  XML_REGEXP_OTHER_CONTROL,
141  XML_REGEXP_OTHER_FORMAT,
142  XML_REGEXP_OTHER_PRIVATE,
143  XML_REGEXP_OTHER_NA,
144  XML_REGEXP_BLOCK_NAME
145 } xmlRegAtomType;
146 
147 typedef enum {
148  XML_REGEXP_QUANT_EPSILON = 1,
149  XML_REGEXP_QUANT_ONCE,
150  XML_REGEXP_QUANT_OPT,
151  XML_REGEXP_QUANT_MULT,
152  XML_REGEXP_QUANT_PLUS,
153  XML_REGEXP_QUANT_ONCEONLY,
154  XML_REGEXP_QUANT_ALL,
155  XML_REGEXP_QUANT_RANGE
156 } xmlRegQuantType;
157 
158 typedef enum {
159  XML_REGEXP_START_STATE = 1,
160  XML_REGEXP_FINAL_STATE,
161  XML_REGEXP_TRANS_STATE,
162  XML_REGEXP_SINK_STATE,
163  XML_REGEXP_UNREACH_STATE
164 } xmlRegStateType;
165 
166 typedef enum {
167  XML_REGEXP_MARK_NORMAL = 0,
168  XML_REGEXP_MARK_START,
169  XML_REGEXP_MARK_VISITED
170 } xmlRegMarkedType;
171 
172 typedef struct _xmlRegRange xmlRegRange;
173 typedef xmlRegRange *xmlRegRangePtr;
174 
175 struct _xmlRegRange {
176  int neg; /* 0 normal, 1 not, 2 exclude */
177  xmlRegAtomType type;
178  int start;
179  int end;
180  xmlChar *blockName;
181 };
182 
183 typedef struct _xmlRegAtom xmlRegAtom;
184 typedef xmlRegAtom *xmlRegAtomPtr;
185 
186 typedef struct _xmlAutomataState xmlRegState;
187 typedef xmlRegState *xmlRegStatePtr;
188 
189 struct _xmlRegAtom {
190  int no;
191  xmlRegAtomType type;
192  xmlRegQuantType quant;
193  int min;
194  int max;
195 
196  void *valuep;
197  void *valuep2;
198  int neg;
199  int codepoint;
200  xmlRegStatePtr start;
201  xmlRegStatePtr start0;
202  xmlRegStatePtr stop;
203  int maxRanges;
204  int nbRanges;
205  xmlRegRangePtr *ranges;
206  void *data;
207 };
208 
209 typedef struct _xmlRegCounter xmlRegCounter;
210 typedef xmlRegCounter *xmlRegCounterPtr;
211 
212 struct _xmlRegCounter {
213  int min;
214  int max;
215 };
216 
217 typedef struct _xmlRegTrans xmlRegTrans;
218 typedef xmlRegTrans *xmlRegTransPtr;
219 
220 struct _xmlRegTrans {
221  xmlRegAtomPtr atom;
222  int to;
223  int counter;
224  int count;
225  int nd;
226 };
227 
228 struct _xmlAutomataState {
229  xmlRegStateType type;
230  xmlRegMarkedType mark;
231  xmlRegMarkedType markd;
232  xmlRegMarkedType reached;
233  int no;
234  int maxTrans;
235  int nbTrans;
236  xmlRegTrans *trans;
237  /* knowing states pointing to us can speed things up */
238  int maxTransTo;
239  int nbTransTo;
240  int *transTo;
241 };
242 
243 typedef struct _xmlAutomata xmlRegParserCtxt;
244 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
245 
246 #define AM_AUTOMATA_RNG 1
247 
248 struct _xmlAutomata {
249  xmlChar *string;
250  xmlChar *cur;
251 
252  int error;
253  int neg;
254 
255  xmlRegStatePtr start;
256  xmlRegStatePtr end;
257  xmlRegStatePtr state;
258 
259  xmlRegAtomPtr atom;
260 
261  int maxAtoms;
262  int nbAtoms;
263  xmlRegAtomPtr *atoms;
264 
265  int maxStates;
266  int nbStates;
267  xmlRegStatePtr *states;
268 
269  int maxCounters;
270  int nbCounters;
271  xmlRegCounter *counters;
272 
273  int determinist;
274  int negs;
275  int flags;
276 
277  int depth;
278 };
279 
280 struct _xmlRegexp {
281  xmlChar *string;
282  int nbStates;
283  xmlRegStatePtr *states;
284  int nbAtoms;
285  xmlRegAtomPtr *atoms;
286  int nbCounters;
287  xmlRegCounter *counters;
288  int determinist;
289  int flags;
290  /*
291  * That's the compact form for determinists automatas
292  */
293  int nbstates;
294  int *compact;
295  void **transdata;
296  int nbstrings;
297  xmlChar **stringMap;
298 };
299 
300 typedef struct _xmlRegExecRollback xmlRegExecRollback;
301 typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
302 
303 struct _xmlRegExecRollback {
304  xmlRegStatePtr state;/* the current state */
305  int index; /* the index in the input stack */
306  int nextbranch; /* the next transition to explore in that state */
307  int *counts; /* save the automata state if it has some */
308 };
309 
310 typedef struct _xmlRegInputToken xmlRegInputToken;
311 typedef xmlRegInputToken *xmlRegInputTokenPtr;
312 
313 struct _xmlRegInputToken {
314  xmlChar *value;
315  void *data;
316 };
317 
318 struct _xmlRegExecCtxt {
319  int status; /* execution status != 0 indicate an error */
320  int determinist; /* did we find an indeterministic behaviour */
321  xmlRegexpPtr comp; /* the compiled regexp */
322  xmlRegExecCallbacks callback;
323  void *data;
324 
325  xmlRegStatePtr state;/* the current state */
326  int transno; /* the current transition on that state */
327  int transcount; /* the number of chars in char counted transitions */
328 
329  /*
330  * A stack of rollback states
331  */
332  int maxRollbacks;
333  int nbRollbacks;
334  xmlRegExecRollback *rollbacks;
335 
336  /*
337  * The state of the automata if any
338  */
339  int *counts;
340 
341  /*
342  * The input stack
343  */
344  int inputStackMax;
345  int inputStackNr;
346  int index;
347  int *charStack;
348  const xmlChar *inputString; /* when operating on characters */
349  xmlRegInputTokenPtr inputStack;/* when operating on strings */
350 
351  /*
352  * error handling
353  */
354  int errStateNo; /* the error state number */
355  xmlRegStatePtr errState; /* the error state */
356  xmlChar *errString; /* the string raising the error */
357  int *errCounts; /* counters at the error state */
358  int nbPush;
359 };
360 
361 #define REGEXP_ALL_COUNTER 0x123456
362 #define REGEXP_ALL_LAX_COUNTER 0x123457
363 
364 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
365 static void xmlRegFreeState(xmlRegStatePtr state);
366 static void xmlRegFreeAtom(xmlRegAtomPtr atom);
367 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
368 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
369 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
370  int neg, int start, int end, const xmlChar *blockName);
371 
372 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
373 
374 /************************************************************************
375  * *
376  * Regexp memory error handler *
377  * *
378  ************************************************************************/
385 static void
386 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
387 {
388  const char *regexp = NULL;
389  if (ctxt != NULL) {
390  regexp = (const char *) ctxt->string;
391  ctxt->error = XML_ERR_NO_MEMORY;
392  }
393  __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
395  regexp, NULL, 0, 0,
396  "Memory allocation failed : %s\n", extra);
397 }
398 
405 static void
406 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
407 {
408  const char *regexp = NULL;
409  int idx = 0;
410 
411  if (ctxt != NULL) {
412  regexp = (const char *) ctxt->string;
413  idx = ctxt->cur - ctxt->string;
414  ctxt->error = XML_REGEXP_COMPILE_ERROR;
415  }
416  __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
418  regexp, NULL, idx, 0,
419  "failed to compile: %s\n", extra);
420 }
421 
422 /************************************************************************
423  * *
424  * Allocation/Deallocation *
425  * *
426  ************************************************************************/
427 
428 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
429 
440 static void*
441 xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
442  size_t totalSize;
443  void *ret;
444 
445  /* Check for overflow */
446  if (dim1 > SIZE_MAX / dim2 / elemSize)
447  return (NULL);
448  totalSize = dim1 * dim2 * elemSize;
449  ret = xmlMalloc(totalSize);
450  if (ret != NULL)
451  memset(ret, 0, totalSize);
452  return (ret);
453 }
454 
463 static xmlRegexpPtr
464 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
465  xmlRegexpPtr ret;
466 
467  ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
468  if (ret == NULL) {
469  xmlRegexpErrMemory(ctxt, "compiling regexp");
470  return(NULL);
471  }
472  memset(ret, 0, sizeof(xmlRegexp));
473  ret->string = ctxt->string;
474  ret->nbStates = ctxt->nbStates;
475  ret->states = ctxt->states;
476  ret->nbAtoms = ctxt->nbAtoms;
477  ret->atoms = ctxt->atoms;
478  ret->nbCounters = ctxt->nbCounters;
479  ret->counters = ctxt->counters;
480  ret->determinist = ctxt->determinist;
481  ret->flags = ctxt->flags;
482  if (ret->determinist == -1) {
483  xmlRegexpIsDeterminist(ret);
484  }
485 
486  if ((ret->determinist != 0) &&
487  (ret->nbCounters == 0) &&
488  (ctxt->negs == 0) &&
489  (ret->atoms != NULL) &&
490  (ret->atoms[0] != NULL) &&
491  (ret->atoms[0]->type == XML_REGEXP_STRING)) {
492  int i, j, nbstates = 0, nbatoms = 0;
493  int *stateRemap;
494  int *stringRemap;
495  int *transitions;
496  void **transdata;
497  xmlChar **stringMap;
498  xmlChar *value;
499 
500  /*
501  * Switch to a compact representation
502  * 1/ counting the effective number of states left
503  * 2/ counting the unique number of atoms, and check that
504  * they are all of the string type
505  * 3/ build a table state x atom for the transitions
506  */
507 
508  stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
509  if (stateRemap == NULL) {
510  xmlRegexpErrMemory(ctxt, "compiling regexp");
511  xmlFree(ret);
512  return(NULL);
513  }
514  for (i = 0;i < ret->nbStates;i++) {
515  if (ret->states[i] != NULL) {
516  stateRemap[i] = nbstates;
517  nbstates++;
518  } else {
519  stateRemap[i] = -1;
520  }
521  }
522 #ifdef DEBUG_COMPACTION
523  printf("Final: %d states\n", nbstates);
524 #endif
525  stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
526  if (stringMap == NULL) {
527  xmlRegexpErrMemory(ctxt, "compiling regexp");
528  xmlFree(stateRemap);
529  xmlFree(ret);
530  return(NULL);
531  }
532  stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
533  if (stringRemap == NULL) {
534  xmlRegexpErrMemory(ctxt, "compiling regexp");
535  xmlFree(stringMap);
536  xmlFree(stateRemap);
537  xmlFree(ret);
538  return(NULL);
539  }
540  for (i = 0;i < ret->nbAtoms;i++) {
541  if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
542  (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
543  value = ret->atoms[i]->valuep;
544  for (j = 0;j < nbatoms;j++) {
545  if (xmlStrEqual(stringMap[j], value)) {
546  stringRemap[i] = j;
547  break;
548  }
549  }
550  if (j >= nbatoms) {
551  stringRemap[i] = nbatoms;
552  stringMap[nbatoms] = xmlStrdup(value);
553  if (stringMap[nbatoms] == NULL) {
554  for (i = 0;i < nbatoms;i++)
555  xmlFree(stringMap[i]);
556  xmlFree(stringRemap);
557  xmlFree(stringMap);
558  xmlFree(stateRemap);
559  xmlFree(ret);
560  return(NULL);
561  }
562  nbatoms++;
563  }
564  } else {
565  xmlFree(stateRemap);
566  xmlFree(stringRemap);
567  for (i = 0;i < nbatoms;i++)
568  xmlFree(stringMap[i]);
569  xmlFree(stringMap);
570  xmlFree(ret);
571  return(NULL);
572  }
573  }
574 #ifdef DEBUG_COMPACTION
575  printf("Final: %d atoms\n", nbatoms);
576 #endif
577  transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
578  sizeof(int));
579  if (transitions == NULL) {
580  xmlFree(stateRemap);
581  xmlFree(stringRemap);
582  for (i = 0;i < nbatoms;i++)
583  xmlFree(stringMap[i]);
584  xmlFree(stringMap);
585  xmlFree(ret);
586  return(NULL);
587  }
588 
589  /*
590  * Allocate the transition table. The first entry for each
591  * state corresponds to the state type.
592  */
593  transdata = NULL;
594 
595  for (i = 0;i < ret->nbStates;i++) {
596  int stateno, atomno, targetno, prev;
597  xmlRegStatePtr state;
598  xmlRegTransPtr trans;
599 
600  stateno = stateRemap[i];
601  if (stateno == -1)
602  continue;
603  state = ret->states[i];
604 
605  transitions[stateno * (nbatoms + 1)] = state->type;
606 
607  for (j = 0;j < state->nbTrans;j++) {
608  trans = &(state->trans[j]);
609  if ((trans->to == -1) || (trans->atom == NULL))
610  continue;
611  atomno = stringRemap[trans->atom->no];
612  if ((trans->atom->data != NULL) && (transdata == NULL)) {
613  transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
614  sizeof(void *));
615  if (transdata == NULL) {
616  xmlRegexpErrMemory(ctxt, "compiling regexp");
617  break;
618  }
619  }
620  targetno = stateRemap[trans->to];
621  /*
622  * if the same atom can generate transitions to 2 different
623  * states then it means the automata is not deterministic and
624  * the compact form can't be used !
625  */
626  prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
627  if (prev != 0) {
628  if (prev != targetno + 1) {
629  ret->determinist = 0;
630 #ifdef DEBUG_COMPACTION
631  printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
632  i, j, trans->atom->no, trans->to, atomno, targetno);
633  printf(" previous to is %d\n", prev);
634 #endif
635  if (transdata != NULL)
636  xmlFree(transdata);
637  xmlFree(transitions);
638  xmlFree(stateRemap);
639  xmlFree(stringRemap);
640  for (i = 0;i < nbatoms;i++)
641  xmlFree(stringMap[i]);
642  xmlFree(stringMap);
643  goto not_determ;
644  }
645  } else {
646 #if 0
647  printf("State %d trans %d: atom %d to %d : %d to %d\n",
648  i, j, trans->atom->no, trans->to, atomno, targetno);
649 #endif
650  transitions[stateno * (nbatoms + 1) + atomno + 1] =
651  targetno + 1; /* to avoid 0 */
652  if (transdata != NULL)
653  transdata[stateno * nbatoms + atomno] =
654  trans->atom->data;
655  }
656  }
657  }
658  ret->determinist = 1;
659 #ifdef DEBUG_COMPACTION
660  /*
661  * Debug
662  */
663  for (i = 0;i < nbstates;i++) {
664  for (j = 0;j < nbatoms + 1;j++) {
665  printf("%02d ", transitions[i * (nbatoms + 1) + j]);
666  }
667  printf("\n");
668  }
669  printf("\n");
670 #endif
671  /*
672  * Cleanup of the old data
673  */
674  if (ret->states != NULL) {
675  for (i = 0;i < ret->nbStates;i++)
676  xmlRegFreeState(ret->states[i]);
677  xmlFree(ret->states);
678  }
679  ret->states = NULL;
680  ret->nbStates = 0;
681  if (ret->atoms != NULL) {
682  for (i = 0;i < ret->nbAtoms;i++)
683  xmlRegFreeAtom(ret->atoms[i]);
684  xmlFree(ret->atoms);
685  }
686  ret->atoms = NULL;
687  ret->nbAtoms = 0;
688 
689  ret->compact = transitions;
690  ret->transdata = transdata;
691  ret->stringMap = stringMap;
692  ret->nbstrings = nbatoms;
693  ret->nbstates = nbstates;
694  xmlFree(stateRemap);
695  xmlFree(stringRemap);
696  }
697 not_determ:
698  ctxt->string = NULL;
699  ctxt->nbStates = 0;
700  ctxt->states = NULL;
701  ctxt->nbAtoms = 0;
702  ctxt->atoms = NULL;
703  ctxt->nbCounters = 0;
704  ctxt->counters = NULL;
705  return(ret);
706 }
707 
716 static xmlRegParserCtxtPtr
717 xmlRegNewParserCtxt(const xmlChar *string) {
718  xmlRegParserCtxtPtr ret;
719 
720  ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
721  if (ret == NULL)
722  return(NULL);
723  memset(ret, 0, sizeof(xmlRegParserCtxt));
724  if (string != NULL)
725  ret->string = xmlStrdup(string);
726  ret->cur = ret->string;
727  ret->neg = 0;
728  ret->negs = 0;
729  ret->error = 0;
730  ret->determinist = -1;
731  return(ret);
732 }
733 
746 static xmlRegRangePtr
747 xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
748  int neg, xmlRegAtomType type, int start, int end) {
749  xmlRegRangePtr ret;
750 
751  ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
752  if (ret == NULL) {
753  xmlRegexpErrMemory(ctxt, "allocating range");
754  return(NULL);
755  }
756  ret->neg = neg;
757  ret->type = type;
758  ret->start = start;
759  ret->end = end;
760  return(ret);
761 }
762 
769 static void
770 xmlRegFreeRange(xmlRegRangePtr range) {
771  if (range == NULL)
772  return;
773 
774  if (range->blockName != NULL)
775  xmlFree(range->blockName);
776  xmlFree(range);
777 }
778 
787 static xmlRegRangePtr
788 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
789  xmlRegRangePtr ret;
790 
791  if (range == NULL)
792  return(NULL);
793 
794  ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
795  range->end);
796  if (ret == NULL)
797  return(NULL);
798  if (range->blockName != NULL) {
799  ret->blockName = xmlStrdup(range->blockName);
800  if (ret->blockName == NULL) {
801  xmlRegexpErrMemory(ctxt, "allocating range");
802  xmlRegFreeRange(ret);
803  return(NULL);
804  }
805  }
806  return(ret);
807 }
808 
818 static xmlRegAtomPtr
819 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
820  xmlRegAtomPtr ret;
821 
822  ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
823  if (ret == NULL) {
824  xmlRegexpErrMemory(ctxt, "allocating atom");
825  return(NULL);
826  }
827  memset(ret, 0, sizeof(xmlRegAtom));
828  ret->type = type;
829  ret->quant = XML_REGEXP_QUANT_ONCE;
830  ret->min = 0;
831  ret->max = 0;
832  return(ret);
833 }
834 
841 static void
842 xmlRegFreeAtom(xmlRegAtomPtr atom) {
843  int i;
844 
845  if (atom == NULL)
846  return;
847 
848  for (i = 0;i < atom->nbRanges;i++)
849  xmlRegFreeRange(atom->ranges[i]);
850  if (atom->ranges != NULL)
851  xmlFree(atom->ranges);
852  if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
853  xmlFree(atom->valuep);
854  if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
855  xmlFree(atom->valuep2);
856  if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
857  xmlFree(atom->valuep);
858  xmlFree(atom);
859 }
860 
870 static xmlRegAtomPtr
871 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
872  xmlRegAtomPtr ret;
873 
874  ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
875  if (ret == NULL) {
876  xmlRegexpErrMemory(ctxt, "copying atom");
877  return(NULL);
878  }
879  memset(ret, 0, sizeof(xmlRegAtom));
880  ret->type = atom->type;
881  ret->quant = atom->quant;
882  ret->min = atom->min;
883  ret->max = atom->max;
884  if (atom->nbRanges > 0) {
885  int i;
886 
887  ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
888  atom->nbRanges);
889  if (ret->ranges == NULL) {
890  xmlRegexpErrMemory(ctxt, "copying atom");
891  goto error;
892  }
893  for (i = 0;i < atom->nbRanges;i++) {
894  ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
895  if (ret->ranges[i] == NULL)
896  goto error;
897  ret->nbRanges = i + 1;
898  }
899  }
900  return(ret);
901 
902 error:
903  xmlRegFreeAtom(ret);
904  return(NULL);
905 }
906 
907 static xmlRegStatePtr
908 xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
909  xmlRegStatePtr ret;
910 
911  ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
912  if (ret == NULL) {
913  xmlRegexpErrMemory(ctxt, "allocating state");
914  return(NULL);
915  }
916  memset(ret, 0, sizeof(xmlRegState));
917  ret->type = XML_REGEXP_TRANS_STATE;
918  ret->mark = XML_REGEXP_MARK_NORMAL;
919  return(ret);
920 }
921 
928 static void
929 xmlRegFreeState(xmlRegStatePtr state) {
930  if (state == NULL)
931  return;
932 
933  if (state->trans != NULL)
934  xmlFree(state->trans);
935  if (state->transTo != NULL)
936  xmlFree(state->transTo);
937  xmlFree(state);
938 }
939 
946 static void
947 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
948  int i;
949  if (ctxt == NULL)
950  return;
951 
952  if (ctxt->string != NULL)
953  xmlFree(ctxt->string);
954  if (ctxt->states != NULL) {
955  for (i = 0;i < ctxt->nbStates;i++)
956  xmlRegFreeState(ctxt->states[i]);
957  xmlFree(ctxt->states);
958  }
959  if (ctxt->atoms != NULL) {
960  for (i = 0;i < ctxt->nbAtoms;i++)
961  xmlRegFreeAtom(ctxt->atoms[i]);
962  xmlFree(ctxt->atoms);
963  }
964  if (ctxt->counters != NULL)
965  xmlFree(ctxt->counters);
966  xmlFree(ctxt);
967 }
968 
969 /************************************************************************
970  * *
971  * Display of Data structures *
972  * *
973  ************************************************************************/
974 
975 static void
976 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
977  switch (type) {
978  case XML_REGEXP_EPSILON:
979  fprintf(output, "epsilon "); break;
980  case XML_REGEXP_CHARVAL:
981  fprintf(output, "charval "); break;
982  case XML_REGEXP_RANGES:
983  fprintf(output, "ranges "); break;
984  case XML_REGEXP_SUBREG:
985  fprintf(output, "subexpr "); break;
986  case XML_REGEXP_STRING:
987  fprintf(output, "string "); break;
988  case XML_REGEXP_ANYCHAR:
989  fprintf(output, "anychar "); break;
990  case XML_REGEXP_ANYSPACE:
991  fprintf(output, "anyspace "); break;
992  case XML_REGEXP_NOTSPACE:
993  fprintf(output, "notspace "); break;
994  case XML_REGEXP_INITNAME:
995  fprintf(output, "initname "); break;
996  case XML_REGEXP_NOTINITNAME:
997  fprintf(output, "notinitname "); break;
998  case XML_REGEXP_NAMECHAR:
999  fprintf(output, "namechar "); break;
1000  case XML_REGEXP_NOTNAMECHAR:
1001  fprintf(output, "notnamechar "); break;
1002  case XML_REGEXP_DECIMAL:
1003  fprintf(output, "decimal "); break;
1004  case XML_REGEXP_NOTDECIMAL:
1005  fprintf(output, "notdecimal "); break;
1006  case XML_REGEXP_REALCHAR:
1007  fprintf(output, "realchar "); break;
1008  case XML_REGEXP_NOTREALCHAR:
1009  fprintf(output, "notrealchar "); break;
1010  case XML_REGEXP_LETTER:
1011  fprintf(output, "LETTER "); break;
1012  case XML_REGEXP_LETTER_UPPERCASE:
1013  fprintf(output, "LETTER_UPPERCASE "); break;
1014  case XML_REGEXP_LETTER_LOWERCASE:
1015  fprintf(output, "LETTER_LOWERCASE "); break;
1016  case XML_REGEXP_LETTER_TITLECASE:
1017  fprintf(output, "LETTER_TITLECASE "); break;
1018  case XML_REGEXP_LETTER_MODIFIER:
1019  fprintf(output, "LETTER_MODIFIER "); break;
1020  case XML_REGEXP_LETTER_OTHERS:
1021  fprintf(output, "LETTER_OTHERS "); break;
1022  case XML_REGEXP_MARK:
1023  fprintf(output, "MARK "); break;
1024  case XML_REGEXP_MARK_NONSPACING:
1025  fprintf(output, "MARK_NONSPACING "); break;
1026  case XML_REGEXP_MARK_SPACECOMBINING:
1027  fprintf(output, "MARK_SPACECOMBINING "); break;
1028  case XML_REGEXP_MARK_ENCLOSING:
1029  fprintf(output, "MARK_ENCLOSING "); break;
1030  case XML_REGEXP_NUMBER:
1031  fprintf(output, "NUMBER "); break;
1032  case XML_REGEXP_NUMBER_DECIMAL:
1033  fprintf(output, "NUMBER_DECIMAL "); break;
1034  case XML_REGEXP_NUMBER_LETTER:
1035  fprintf(output, "NUMBER_LETTER "); break;
1036  case XML_REGEXP_NUMBER_OTHERS:
1037  fprintf(output, "NUMBER_OTHERS "); break;
1038  case XML_REGEXP_PUNCT:
1039  fprintf(output, "PUNCT "); break;
1040  case XML_REGEXP_PUNCT_CONNECTOR:
1041  fprintf(output, "PUNCT_CONNECTOR "); break;
1042  case XML_REGEXP_PUNCT_DASH:
1043  fprintf(output, "PUNCT_DASH "); break;
1044  case XML_REGEXP_PUNCT_OPEN:
1045  fprintf(output, "PUNCT_OPEN "); break;
1046  case XML_REGEXP_PUNCT_CLOSE:
1047  fprintf(output, "PUNCT_CLOSE "); break;
1048  case XML_REGEXP_PUNCT_INITQUOTE:
1049  fprintf(output, "PUNCT_INITQUOTE "); break;
1050  case XML_REGEXP_PUNCT_FINQUOTE:
1051  fprintf(output, "PUNCT_FINQUOTE "); break;
1052  case XML_REGEXP_PUNCT_OTHERS:
1053  fprintf(output, "PUNCT_OTHERS "); break;
1054  case XML_REGEXP_SEPAR:
1055  fprintf(output, "SEPAR "); break;
1056  case XML_REGEXP_SEPAR_SPACE:
1057  fprintf(output, "SEPAR_SPACE "); break;
1058  case XML_REGEXP_SEPAR_LINE:
1059  fprintf(output, "SEPAR_LINE "); break;
1060  case XML_REGEXP_SEPAR_PARA:
1061  fprintf(output, "SEPAR_PARA "); break;
1062  case XML_REGEXP_SYMBOL:
1063  fprintf(output, "SYMBOL "); break;
1064  case XML_REGEXP_SYMBOL_MATH:
1065  fprintf(output, "SYMBOL_MATH "); break;
1066  case XML_REGEXP_SYMBOL_CURRENCY:
1067  fprintf(output, "SYMBOL_CURRENCY "); break;
1068  case XML_REGEXP_SYMBOL_MODIFIER:
1069  fprintf(output, "SYMBOL_MODIFIER "); break;
1070  case XML_REGEXP_SYMBOL_OTHERS:
1071  fprintf(output, "SYMBOL_OTHERS "); break;
1072  case XML_REGEXP_OTHER:
1073  fprintf(output, "OTHER "); break;
1074  case XML_REGEXP_OTHER_CONTROL:
1075  fprintf(output, "OTHER_CONTROL "); break;
1076  case XML_REGEXP_OTHER_FORMAT:
1077  fprintf(output, "OTHER_FORMAT "); break;
1078  case XML_REGEXP_OTHER_PRIVATE:
1079  fprintf(output, "OTHER_PRIVATE "); break;
1080  case XML_REGEXP_OTHER_NA:
1081  fprintf(output, "OTHER_NA "); break;
1082  case XML_REGEXP_BLOCK_NAME:
1083  fprintf(output, "BLOCK "); break;
1084  }
1085 }
1086 
1087 static void
1088 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1089  switch (type) {
1090  case XML_REGEXP_QUANT_EPSILON:
1091  fprintf(output, "epsilon "); break;
1092  case XML_REGEXP_QUANT_ONCE:
1093  fprintf(output, "once "); break;
1094  case XML_REGEXP_QUANT_OPT:
1095  fprintf(output, "? "); break;
1096  case XML_REGEXP_QUANT_MULT:
1097  fprintf(output, "* "); break;
1098  case XML_REGEXP_QUANT_PLUS:
1099  fprintf(output, "+ "); break;
1100  case XML_REGEXP_QUANT_RANGE:
1101  fprintf(output, "range "); break;
1102  case XML_REGEXP_QUANT_ONCEONLY:
1103  fprintf(output, "onceonly "); break;
1104  case XML_REGEXP_QUANT_ALL:
1105  fprintf(output, "all "); break;
1106  }
1107 }
1108 static void
1109 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1110  fprintf(output, " range: ");
1111  if (range->neg)
1112  fprintf(output, "negative ");
1113  xmlRegPrintAtomType(output, range->type);
1114  fprintf(output, "%c - %c\n", range->start, range->end);
1115 }
1116 
1117 static void
1118 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1119  fprintf(output, " atom: ");
1120  if (atom == NULL) {
1121  fprintf(output, "NULL\n");
1122  return;
1123  }
1124  if (atom->neg)
1125  fprintf(output, "not ");
1126  xmlRegPrintAtomType(output, atom->type);
1127  xmlRegPrintQuantType(output, atom->quant);
1128  if (atom->quant == XML_REGEXP_QUANT_RANGE)
1129  fprintf(output, "%d-%d ", atom->min, atom->max);
1130  if (atom->type == XML_REGEXP_STRING)
1131  fprintf(output, "'%s' ", (char *) atom->valuep);
1132  if (atom->type == XML_REGEXP_CHARVAL)
1133  fprintf(output, "char %c\n", atom->codepoint);
1134  else if (atom->type == XML_REGEXP_RANGES) {
1135  int i;
1136  fprintf(output, "%d entries\n", atom->nbRanges);
1137  for (i = 0; i < atom->nbRanges;i++)
1138  xmlRegPrintRange(output, atom->ranges[i]);
1139  } else if (atom->type == XML_REGEXP_SUBREG) {
1140  fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1141  } else {
1142  fprintf(output, "\n");
1143  }
1144 }
1145 
1146 static void
1147 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1148  fprintf(output, " trans: ");
1149  if (trans == NULL) {
1150  fprintf(output, "NULL\n");
1151  return;
1152  }
1153  if (trans->to < 0) {
1154  fprintf(output, "removed\n");
1155  return;
1156  }
1157  if (trans->nd != 0) {
1158  if (trans->nd == 2)
1159  fprintf(output, "last not determinist, ");
1160  else
1161  fprintf(output, "not determinist, ");
1162  }
1163  if (trans->counter >= 0) {
1164  fprintf(output, "counted %d, ", trans->counter);
1165  }
1166  if (trans->count == REGEXP_ALL_COUNTER) {
1167  fprintf(output, "all transition, ");
1168  } else if (trans->count >= 0) {
1169  fprintf(output, "count based %d, ", trans->count);
1170  }
1171  if (trans->atom == NULL) {
1172  fprintf(output, "epsilon to %d\n", trans->to);
1173  return;
1174  }
1175  if (trans->atom->type == XML_REGEXP_CHARVAL)
1176  fprintf(output, "char %c ", trans->atom->codepoint);
1177  fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1178 }
1179 
1180 static void
1181 xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1182  int i;
1183 
1184  fprintf(output, " state: ");
1185  if (state == NULL) {
1186  fprintf(output, "NULL\n");
1187  return;
1188  }
1189  if (state->type == XML_REGEXP_START_STATE)
1190  fprintf(output, "START ");
1191  if (state->type == XML_REGEXP_FINAL_STATE)
1192  fprintf(output, "FINAL ");
1193 
1194  fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1195  for (i = 0;i < state->nbTrans; i++) {
1196  xmlRegPrintTrans(output, &(state->trans[i]));
1197  }
1198 }
1199 
1200 #ifdef DEBUG_REGEXP_GRAPH
1201 static void
1202 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1203  int i;
1204 
1205  fprintf(output, " ctxt: ");
1206  if (ctxt == NULL) {
1207  fprintf(output, "NULL\n");
1208  return;
1209  }
1210  fprintf(output, "'%s' ", ctxt->string);
1211  if (ctxt->error)
1212  fprintf(output, "error ");
1213  if (ctxt->neg)
1214  fprintf(output, "neg ");
1215  fprintf(output, "\n");
1216  fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1217  for (i = 0;i < ctxt->nbAtoms; i++) {
1218  fprintf(output, " %02d ", i);
1219  xmlRegPrintAtom(output, ctxt->atoms[i]);
1220  }
1221  if (ctxt->atom != NULL) {
1222  fprintf(output, "current atom:\n");
1223  xmlRegPrintAtom(output, ctxt->atom);
1224  }
1225  fprintf(output, "%d states:", ctxt->nbStates);
1226  if (ctxt->start != NULL)
1227  fprintf(output, " start: %d", ctxt->start->no);
1228  if (ctxt->end != NULL)
1229  fprintf(output, " end: %d", ctxt->end->no);
1230  fprintf(output, "\n");
1231  for (i = 0;i < ctxt->nbStates; i++) {
1232  xmlRegPrintState(output, ctxt->states[i]);
1233  }
1234  fprintf(output, "%d counters:\n", ctxt->nbCounters);
1235  for (i = 0;i < ctxt->nbCounters; i++) {
1236  fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1237  ctxt->counters[i].max);
1238  }
1239 }
1240 #endif
1241 
1242 /************************************************************************
1243  * *
1244  * Finite Automata structures manipulations *
1245  * *
1246  ************************************************************************/
1247 
1248 static void
1249 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1250  int neg, xmlRegAtomType type, int start, int end,
1251  xmlChar *blockName) {
1252  xmlRegRangePtr range;
1253 
1254  if (atom == NULL) {
1255  ERROR("add range: atom is NULL");
1256  return;
1257  }
1258  if (atom->type != XML_REGEXP_RANGES) {
1259  ERROR("add range: atom is not ranges");
1260  return;
1261  }
1262  if (atom->maxRanges == 0) {
1263  atom->maxRanges = 4;
1264  atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1265  sizeof(xmlRegRangePtr));
1266  if (atom->ranges == NULL) {
1267  xmlRegexpErrMemory(ctxt, "adding ranges");
1268  atom->maxRanges = 0;
1269  return;
1270  }
1271  } else if (atom->nbRanges >= atom->maxRanges) {
1272  xmlRegRangePtr *tmp;
1273  atom->maxRanges *= 2;
1274  tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1275  sizeof(xmlRegRangePtr));
1276  if (tmp == NULL) {
1277  xmlRegexpErrMemory(ctxt, "adding ranges");
1278  atom->maxRanges /= 2;
1279  return;
1280  }
1281  atom->ranges = tmp;
1282  }
1283  range = xmlRegNewRange(ctxt, neg, type, start, end);
1284  if (range == NULL)
1285  return;
1286  range->blockName = blockName;
1287  atom->ranges[atom->nbRanges++] = range;
1288 
1289 }
1290 
1291 static int
1292 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1293  if (ctxt->maxCounters == 0) {
1294  ctxt->maxCounters = 4;
1295  ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1296  sizeof(xmlRegCounter));
1297  if (ctxt->counters == NULL) {
1298  xmlRegexpErrMemory(ctxt, "allocating counter");
1299  ctxt->maxCounters = 0;
1300  return(-1);
1301  }
1302  } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1303  xmlRegCounter *tmp;
1304  ctxt->maxCounters *= 2;
1305  tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1306  sizeof(xmlRegCounter));
1307  if (tmp == NULL) {
1308  xmlRegexpErrMemory(ctxt, "allocating counter");
1309  ctxt->maxCounters /= 2;
1310  return(-1);
1311  }
1312  ctxt->counters = tmp;
1313  }
1314  ctxt->counters[ctxt->nbCounters].min = -1;
1315  ctxt->counters[ctxt->nbCounters].max = -1;
1316  return(ctxt->nbCounters++);
1317 }
1318 
1319 static int
1320 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1321  if (atom == NULL) {
1322  ERROR("atom push: atom is NULL");
1323  return(-1);
1324  }
1325  if (ctxt->maxAtoms == 0) {
1326  ctxt->maxAtoms = 4;
1327  ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1328  sizeof(xmlRegAtomPtr));
1329  if (ctxt->atoms == NULL) {
1330  xmlRegexpErrMemory(ctxt, "pushing atom");
1331  ctxt->maxAtoms = 0;
1332  return(-1);
1333  }
1334  } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1335  xmlRegAtomPtr *tmp;
1336  ctxt->maxAtoms *= 2;
1337  tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1338  sizeof(xmlRegAtomPtr));
1339  if (tmp == NULL) {
1340  xmlRegexpErrMemory(ctxt, "allocating counter");
1341  ctxt->maxAtoms /= 2;
1342  return(-1);
1343  }
1344  ctxt->atoms = tmp;
1345  }
1346  atom->no = ctxt->nbAtoms;
1347  ctxt->atoms[ctxt->nbAtoms++] = atom;
1348  return(0);
1349 }
1350 
1351 static void
1352 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1353  int from) {
1354  if (target->maxTransTo == 0) {
1355  target->maxTransTo = 8;
1356  target->transTo = (int *) xmlMalloc(target->maxTransTo *
1357  sizeof(int));
1358  if (target->transTo == NULL) {
1359  xmlRegexpErrMemory(ctxt, "adding transition");
1360  target->maxTransTo = 0;
1361  return;
1362  }
1363  } else if (target->nbTransTo >= target->maxTransTo) {
1364  int *tmp;
1365  target->maxTransTo *= 2;
1366  tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1367  sizeof(int));
1368  if (tmp == NULL) {
1369  xmlRegexpErrMemory(ctxt, "adding transition");
1370  target->maxTransTo /= 2;
1371  return;
1372  }
1373  target->transTo = tmp;
1374  }
1375  target->transTo[target->nbTransTo] = from;
1376  target->nbTransTo++;
1377 }
1378 
1379 static void
1380 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1381  xmlRegAtomPtr atom, xmlRegStatePtr target,
1382  int counter, int count) {
1383 
1384  int nrtrans;
1385 
1386  if (state == NULL) {
1387  ERROR("add state: state is NULL");
1388  return;
1389  }
1390  if (target == NULL) {
1391  ERROR("add state: target is NULL");
1392  return;
1393  }
1394  /*
1395  * Other routines follow the philosophy 'When in doubt, add a transition'
1396  * so we check here whether such a transition is already present and, if
1397  * so, silently ignore this request.
1398  */
1399 
1400  for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1401  xmlRegTransPtr trans = &(state->trans[nrtrans]);
1402  if ((trans->atom == atom) &&
1403  (trans->to == target->no) &&
1404  (trans->counter == counter) &&
1405  (trans->count == count)) {
1406 #ifdef DEBUG_REGEXP_GRAPH
1407  printf("Ignoring duplicate transition from %d to %d\n",
1408  state->no, target->no);
1409 #endif
1410  return;
1411  }
1412  }
1413 
1414  if (state->maxTrans == 0) {
1415  state->maxTrans = 8;
1416  state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1417  sizeof(xmlRegTrans));
1418  if (state->trans == NULL) {
1419  xmlRegexpErrMemory(ctxt, "adding transition");
1420  state->maxTrans = 0;
1421  return;
1422  }
1423  } else if (state->nbTrans >= state->maxTrans) {
1424  xmlRegTrans *tmp;
1425  state->maxTrans *= 2;
1426  tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1427  sizeof(xmlRegTrans));
1428  if (tmp == NULL) {
1429  xmlRegexpErrMemory(ctxt, "adding transition");
1430  state->maxTrans /= 2;
1431  return;
1432  }
1433  state->trans = tmp;
1434  }
1435 #ifdef DEBUG_REGEXP_GRAPH
1436  printf("Add trans from %d to %d ", state->no, target->no);
1437  if (count == REGEXP_ALL_COUNTER)
1438  printf("all transition\n");
1439  else if (count >= 0)
1440  printf("count based %d\n", count);
1441  else if (counter >= 0)
1442  printf("counted %d\n", counter);
1443  else if (atom == NULL)
1444  printf("epsilon transition\n");
1445  else if (atom != NULL)
1446  xmlRegPrintAtom(stdout, atom);
1447 #endif
1448 
1449  state->trans[state->nbTrans].atom = atom;
1450  state->trans[state->nbTrans].to = target->no;
1451  state->trans[state->nbTrans].counter = counter;
1452  state->trans[state->nbTrans].count = count;
1453  state->trans[state->nbTrans].nd = 0;
1454  state->nbTrans++;
1455  xmlRegStateAddTransTo(ctxt, target, state->no);
1456 }
1457 
1458 static int
1459 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1460  if (state == NULL) return(-1);
1461  if (ctxt->maxStates == 0) {
1462  ctxt->maxStates = 4;
1463  ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1464  sizeof(xmlRegStatePtr));
1465  if (ctxt->states == NULL) {
1466  xmlRegexpErrMemory(ctxt, "adding state");
1467  ctxt->maxStates = 0;
1468  return(-1);
1469  }
1470  } else if (ctxt->nbStates >= ctxt->maxStates) {
1471  xmlRegStatePtr *tmp;
1472  ctxt->maxStates *= 2;
1473  tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1474  sizeof(xmlRegStatePtr));
1475  if (tmp == NULL) {
1476  xmlRegexpErrMemory(ctxt, "adding state");
1477  ctxt->maxStates /= 2;
1478  return(-1);
1479  }
1480  ctxt->states = tmp;
1481  }
1482  state->no = ctxt->nbStates;
1483  ctxt->states[ctxt->nbStates++] = state;
1484  return(0);
1485 }
1486 
1495 static void
1496 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1497  xmlRegStatePtr from, xmlRegStatePtr to,
1498  int lax) {
1499  if (to == NULL) {
1500  to = xmlRegNewState(ctxt);
1501  xmlRegStatePush(ctxt, to);
1502  ctxt->state = to;
1503  }
1504  if (lax)
1505  xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1506  else
1507  xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1508 }
1509 
1517 static void
1518 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1519  xmlRegStatePtr from, xmlRegStatePtr to) {
1520  if (to == NULL) {
1521  to = xmlRegNewState(ctxt);
1522  xmlRegStatePush(ctxt, to);
1523  ctxt->state = to;
1524  }
1525  xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1526 }
1527 
1536 static void
1537 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1538  xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1539  if (to == NULL) {
1540  to = xmlRegNewState(ctxt);
1541  xmlRegStatePush(ctxt, to);
1542  ctxt->state = to;
1543  }
1544  xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1545 }
1546 
1555 static void
1556 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1557  xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1558  if (to == NULL) {
1559  to = xmlRegNewState(ctxt);
1560  xmlRegStatePush(ctxt, to);
1561  ctxt->state = to;
1562  }
1563  xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1564 }
1565 
1575 static int
1576 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1577  xmlRegStatePtr to, xmlRegAtomPtr atom) {
1578  xmlRegStatePtr end;
1579  int nullable = 0;
1580 
1581  if (atom == NULL) {
1582  ERROR("generate transition: atom == NULL");
1583  return(-1);
1584  }
1585  if (atom->type == XML_REGEXP_SUBREG) {
1586  /*
1587  * this is a subexpression handling one should not need to
1588  * create a new node except for XML_REGEXP_QUANT_RANGE.
1589  */
1590  if (xmlRegAtomPush(ctxt, atom) < 0) {
1591  return(-1);
1592  }
1593  if ((to != NULL) && (atom->stop != to) &&
1594  (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1595  /*
1596  * Generate an epsilon transition to link to the target
1597  */
1598  xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1599 #ifdef DV
1600  } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1601  (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1602  to = xmlRegNewState(ctxt);
1603  xmlRegStatePush(ctxt, to);
1604  ctxt->state = to;
1605  xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1606 #endif
1607  }
1608  switch (atom->quant) {
1609  case XML_REGEXP_QUANT_OPT:
1610  atom->quant = XML_REGEXP_QUANT_ONCE;
1611  /*
1612  * transition done to the state after end of atom.
1613  * 1. set transition from atom start to new state
1614  * 2. set transition from atom end to this state.
1615  */
1616  if (to == NULL) {
1617  xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1618  xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1619  ctxt->state);
1620  } else {
1621  xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1622  }
1623  break;
1624  case XML_REGEXP_QUANT_MULT:
1625  atom->quant = XML_REGEXP_QUANT_ONCE;
1626  xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1627  xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1628  break;
1629  case XML_REGEXP_QUANT_PLUS:
1630  atom->quant = XML_REGEXP_QUANT_ONCE;
1631  xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1632  break;
1633  case XML_REGEXP_QUANT_RANGE: {
1634  int counter;
1635  xmlRegStatePtr inter, newstate;
1636 
1637  /*
1638  * create the final state now if needed
1639  */
1640  if (to != NULL) {
1641  newstate = to;
1642  } else {
1643  newstate = xmlRegNewState(ctxt);
1644  xmlRegStatePush(ctxt, newstate);
1645  }
1646 
1647  /*
1648  * The principle here is to use counted transition
1649  * to avoid explosion in the number of states in the
1650  * graph. This is clearly more complex but should not
1651  * be exploitable at runtime.
1652  */
1653  if ((atom->min == 0) && (atom->start0 == NULL)) {
1654  xmlRegAtomPtr copy;
1655  /*
1656  * duplicate a transition based on atom to count next
1657  * occurrences after 1. We cannot loop to atom->start
1658  * directly because we need an epsilon transition to
1659  * newstate.
1660  */
1661  /* ???? For some reason it seems we never reach that
1662  case, I suppose this got optimized out before when
1663  building the automata */
1664  copy = xmlRegCopyAtom(ctxt, atom);
1665  if (copy == NULL)
1666  return(-1);
1667  copy->quant = XML_REGEXP_QUANT_ONCE;
1668  copy->min = 0;
1669  copy->max = 0;
1670 
1671  if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1672  < 0)
1673  return(-1);
1674  inter = ctxt->state;
1675  counter = xmlRegGetCounter(ctxt);
1676  ctxt->counters[counter].min = atom->min - 1;
1677  ctxt->counters[counter].max = atom->max - 1;
1678  /* count the number of times we see it again */
1679  xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1680  atom->stop, counter);
1681  /* allow a way out based on the count */
1682  xmlFAGenerateCountedTransition(ctxt, inter,
1683  newstate, counter);
1684  /* and also allow a direct exit for 0 */
1685  xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1686  newstate);
1687  } else {
1688  /*
1689  * either we need the atom at least once or there
1690  * is an atom->start0 allowing to easily plug the
1691  * epsilon transition.
1692  */
1693  counter = xmlRegGetCounter(ctxt);
1694  ctxt->counters[counter].min = atom->min - 1;
1695  ctxt->counters[counter].max = atom->max - 1;
1696  /* count the number of times we see it again */
1697  xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1698  atom->start, counter);
1699  /* allow a way out based on the count */
1700  xmlFAGenerateCountedTransition(ctxt, atom->stop,
1701  newstate, counter);
1702  /* and if needed allow a direct exit for 0 */
1703  if (atom->min == 0)
1704  xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1705  newstate);
1706 
1707  }
1708  atom->min = 0;
1709  atom->max = 0;
1710  atom->quant = XML_REGEXP_QUANT_ONCE;
1711  ctxt->state = newstate;
1712  }
1713  default:
1714  break;
1715  }
1716  return(0);
1717  }
1718  if ((atom->min == 0) && (atom->max == 0) &&
1719  (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1720  /*
1721  * we can discard the atom and generate an epsilon transition instead
1722  */
1723  if (to == NULL) {
1724  to = xmlRegNewState(ctxt);
1725  if (to != NULL)
1726  xmlRegStatePush(ctxt, to);
1727  else {
1728  return(-1);
1729  }
1730  }
1731  xmlFAGenerateEpsilonTransition(ctxt, from, to);
1732  ctxt->state = to;
1733  xmlRegFreeAtom(atom);
1734  return(0);
1735  }
1736  if (to == NULL) {
1737  to = xmlRegNewState(ctxt);
1738  if (to != NULL)
1739  xmlRegStatePush(ctxt, to);
1740  else {
1741  return(-1);
1742  }
1743  }
1744  end = to;
1745  if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1746  (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1747  /*
1748  * Do not pollute the target state by adding transitions from
1749  * it as it is likely to be the shared target of multiple branches.
1750  * So isolate with an epsilon transition.
1751  */
1752  xmlRegStatePtr tmp;
1753 
1754  tmp = xmlRegNewState(ctxt);
1755  if (tmp != NULL)
1756  xmlRegStatePush(ctxt, tmp);
1757  else {
1758  return(-1);
1759  }
1760  xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1761  to = tmp;
1762  }
1763  if (xmlRegAtomPush(ctxt, atom) < 0) {
1764  return(-1);
1765  }
1766  if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1767  (atom->min == 0) && (atom->max > 0)) {
1768  nullable = 1;
1769  atom->min = 1;
1770  if (atom->max == 1)
1771  atom->quant = XML_REGEXP_QUANT_OPT;
1772  }
1773  xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1774  ctxt->state = end;
1775  switch (atom->quant) {
1776  case XML_REGEXP_QUANT_OPT:
1777  atom->quant = XML_REGEXP_QUANT_ONCE;
1778  xmlFAGenerateEpsilonTransition(ctxt, from, to);
1779  break;
1780  case XML_REGEXP_QUANT_MULT:
1781  atom->quant = XML_REGEXP_QUANT_ONCE;
1782  xmlFAGenerateEpsilonTransition(ctxt, from, to);
1783  xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1784  break;
1785  case XML_REGEXP_QUANT_PLUS:
1786  atom->quant = XML_REGEXP_QUANT_ONCE;
1787  xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1788  break;
1789  case XML_REGEXP_QUANT_RANGE:
1790  if (nullable)
1791  xmlFAGenerateEpsilonTransition(ctxt, from, to);
1792  break;
1793  default:
1794  break;
1795  }
1796  return(0);
1797 }
1798 
1807 static void
1808 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1809  int tonr, int counter) {
1810  int transnr;
1811  xmlRegStatePtr from;
1812  xmlRegStatePtr to;
1813 
1814 #ifdef DEBUG_REGEXP_GRAPH
1815  printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1816 #endif
1817  from = ctxt->states[fromnr];
1818  if (from == NULL)
1819  return;
1820  to = ctxt->states[tonr];
1821  if (to == NULL)
1822  return;
1823  if ((to->mark == XML_REGEXP_MARK_START) ||
1824  (to->mark == XML_REGEXP_MARK_VISITED))
1825  return;
1826 
1827  to->mark = XML_REGEXP_MARK_VISITED;
1828  if (to->type == XML_REGEXP_FINAL_STATE) {
1829 #ifdef DEBUG_REGEXP_GRAPH
1830  printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1831 #endif
1832  from->type = XML_REGEXP_FINAL_STATE;
1833  }
1834  for (transnr = 0;transnr < to->nbTrans;transnr++) {
1835  if (to->trans[transnr].to < 0)
1836  continue;
1837  if (to->trans[transnr].atom == NULL) {
1838  /*
1839  * Don't remove counted transitions
1840  * Don't loop either
1841  */
1842  if (to->trans[transnr].to != fromnr) {
1843  if (to->trans[transnr].count >= 0) {
1844  int newto = to->trans[transnr].to;
1845 
1846  xmlRegStateAddTrans(ctxt, from, NULL,
1847  ctxt->states[newto],
1848  -1, to->trans[transnr].count);
1849  } else {
1850 #ifdef DEBUG_REGEXP_GRAPH
1851  printf("Found epsilon trans %d from %d to %d\n",
1852  transnr, tonr, to->trans[transnr].to);
1853 #endif
1854  if (to->trans[transnr].counter >= 0) {
1855  xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1856  to->trans[transnr].to,
1857  to->trans[transnr].counter);
1858  } else {
1859  xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1860  to->trans[transnr].to,
1861  counter);
1862  }
1863  }
1864  }
1865  } else {
1866  int newto = to->trans[transnr].to;
1867 
1868  if (to->trans[transnr].counter >= 0) {
1869  xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1870  ctxt->states[newto],
1871  to->trans[transnr].counter, -1);
1872  } else {
1873  xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1874  ctxt->states[newto], counter, -1);
1875  }
1876  }
1877  }
1878  to->mark = XML_REGEXP_MARK_NORMAL;
1879 }
1880 
1896 static void
1897 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1898  int statenr, i, j, newto;
1899  xmlRegStatePtr state, tmp;
1900 
1901  for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1902  state = ctxt->states[statenr];
1903  if (state == NULL)
1904  continue;
1905  if (state->nbTrans != 1)
1906  continue;
1907  if (state->type == XML_REGEXP_UNREACH_STATE)
1908  continue;
1909  /* is the only transition out a basic transition */
1910  if ((state->trans[0].atom == NULL) &&
1911  (state->trans[0].to >= 0) &&
1912  (state->trans[0].to != statenr) &&
1913  (state->trans[0].counter < 0) &&
1914  (state->trans[0].count < 0)) {
1915  newto = state->trans[0].to;
1916 
1917  if (state->type == XML_REGEXP_START_STATE) {
1918 #ifdef DEBUG_REGEXP_GRAPH
1919  printf("Found simple epsilon trans from start %d to %d\n",
1920  statenr, newto);
1921 #endif
1922  } else {
1923 #ifdef DEBUG_REGEXP_GRAPH
1924  printf("Found simple epsilon trans from %d to %d\n",
1925  statenr, newto);
1926 #endif
1927  for (i = 0;i < state->nbTransTo;i++) {
1928  tmp = ctxt->states[state->transTo[i]];
1929  for (j = 0;j < tmp->nbTrans;j++) {
1930  if (tmp->trans[j].to == statenr) {
1931 #ifdef DEBUG_REGEXP_GRAPH
1932  printf("Changed transition %d on %d to go to %d\n",
1933  j, tmp->no, newto);
1934 #endif
1935  tmp->trans[j].to = -1;
1936  xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1937  ctxt->states[newto],
1938  tmp->trans[j].counter,
1939  tmp->trans[j].count);
1940  }
1941  }
1942  }
1943  if (state->type == XML_REGEXP_FINAL_STATE)
1944  ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1945  /* eliminate the transition completely */
1946  state->nbTrans = 0;
1947 
1948  state->type = XML_REGEXP_UNREACH_STATE;
1949 
1950  }
1951 
1952  }
1953  }
1954 }
1960 static void
1961 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1962  int statenr, transnr;
1963  xmlRegStatePtr state;
1964  int has_epsilon;
1965 
1966  if (ctxt->states == NULL) return;
1967 
1968  /*
1969  * Eliminate simple epsilon transition and the associated unreachable
1970  * states.
1971  */
1972  xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1973  for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1974  state = ctxt->states[statenr];
1975  if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1976 #ifdef DEBUG_REGEXP_GRAPH
1977  printf("Removed unreachable state %d\n", statenr);
1978 #endif
1979  xmlRegFreeState(state);
1980  ctxt->states[statenr] = NULL;
1981  }
1982  }
1983 
1984  has_epsilon = 0;
1985 
1986  /*
1987  * Build the completed transitions bypassing the epsilons
1988  * Use a marking algorithm to avoid loops
1989  * Mark sink states too.
1990  * Process from the latest states backward to the start when
1991  * there is long cascading epsilon chains this minimize the
1992  * recursions and transition compares when adding the new ones
1993  */
1994  for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1995  state = ctxt->states[statenr];
1996  if (state == NULL)
1997  continue;
1998  if ((state->nbTrans == 0) &&
1999  (state->type != XML_REGEXP_FINAL_STATE)) {
2000  state->type = XML_REGEXP_SINK_STATE;
2001  }
2002  for (transnr = 0;transnr < state->nbTrans;transnr++) {
2003  if ((state->trans[transnr].atom == NULL) &&
2004  (state->trans[transnr].to >= 0)) {
2005  if (state->trans[transnr].to == statenr) {
2006  state->trans[transnr].to = -1;
2007 #ifdef DEBUG_REGEXP_GRAPH
2008  printf("Removed loopback epsilon trans %d on %d\n",
2009  transnr, statenr);
2010 #endif
2011  } else if (state->trans[transnr].count < 0) {
2012  int newto = state->trans[transnr].to;
2013 
2014 #ifdef DEBUG_REGEXP_GRAPH
2015  printf("Found epsilon trans %d from %d to %d\n",
2016  transnr, statenr, newto);
2017 #endif
2018  has_epsilon = 1;
2019  state->trans[transnr].to = -2;
2020  state->mark = XML_REGEXP_MARK_START;
2021  xmlFAReduceEpsilonTransitions(ctxt, statenr,
2022  newto, state->trans[transnr].counter);
2023  state->mark = XML_REGEXP_MARK_NORMAL;
2024 #ifdef DEBUG_REGEXP_GRAPH
2025  } else {
2026  printf("Found counted transition %d on %d\n",
2027  transnr, statenr);
2028 #endif
2029  }
2030  }
2031  }
2032  }
2033  /*
2034  * Eliminate the epsilon transitions
2035  */
2036  if (has_epsilon) {
2037  for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2038  state = ctxt->states[statenr];
2039  if (state == NULL)
2040  continue;
2041  for (transnr = 0;transnr < state->nbTrans;transnr++) {
2042  xmlRegTransPtr trans = &(state->trans[transnr]);
2043  if ((trans->atom == NULL) &&
2044  (trans->count < 0) &&
2045  (trans->to >= 0)) {
2046  trans->to = -1;
2047  }
2048  }
2049  }
2050  }
2051 
2052  /*
2053  * Use this pass to detect unreachable states too
2054  */
2055  for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2056  state = ctxt->states[statenr];
2057  if (state != NULL)
2058  state->reached = XML_REGEXP_MARK_NORMAL;
2059  }
2060  state = ctxt->states[0];
2061  if (state != NULL)
2062  state->reached = XML_REGEXP_MARK_START;
2063  while (state != NULL) {
2064  xmlRegStatePtr target = NULL;
2065  state->reached = XML_REGEXP_MARK_VISITED;
2066  /*
2067  * Mark all states reachable from the current reachable state
2068  */
2069  for (transnr = 0;transnr < state->nbTrans;transnr++) {
2070  if ((state->trans[transnr].to >= 0) &&
2071  ((state->trans[transnr].atom != NULL) ||
2072  (state->trans[transnr].count >= 0))) {
2073  int newto = state->trans[transnr].to;
2074 
2075  if (ctxt->states[newto] == NULL)
2076  continue;
2077  if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2078  ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
2079  target = ctxt->states[newto];
2080  }
2081  }
2082  }
2083 
2084  /*
2085  * find the next accessible state not explored
2086  */
2087  if (target == NULL) {
2088  for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2089  state = ctxt->states[statenr];
2090  if ((state != NULL) && (state->reached ==
2091  XML_REGEXP_MARK_START)) {
2092  target = state;
2093  break;
2094  }
2095  }
2096  }
2097  state = target;
2098  }
2099  for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2100  state = ctxt->states[statenr];
2101  if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2102 #ifdef DEBUG_REGEXP_GRAPH
2103  printf("Removed unreachable state %d\n", statenr);
2104 #endif
2105  xmlRegFreeState(state);
2106  ctxt->states[statenr] = NULL;
2107  }
2108  }
2109 
2110 }
2111 
2112 static int
2113 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2114  int ret = 0;
2115 
2116  if ((range1->type == XML_REGEXP_RANGES) ||
2117  (range2->type == XML_REGEXP_RANGES) ||
2118  (range2->type == XML_REGEXP_SUBREG) ||
2119  (range1->type == XML_REGEXP_SUBREG) ||
2120  (range1->type == XML_REGEXP_STRING) ||
2121  (range2->type == XML_REGEXP_STRING))
2122  return(-1);
2123 
2124  /* put them in order */
2125  if (range1->type > range2->type) {
2126  xmlRegRangePtr tmp;
2127 
2128  tmp = range1;
2129  range1 = range2;
2130  range2 = tmp;
2131  }
2132  if ((range1->type == XML_REGEXP_ANYCHAR) ||
2133  (range2->type == XML_REGEXP_ANYCHAR)) {
2134  ret = 1;
2135  } else if ((range1->type == XML_REGEXP_EPSILON) ||
2136  (range2->type == XML_REGEXP_EPSILON)) {
2137  return(0);
2138  } else if (range1->type == range2->type) {
2139  if (range1->type != XML_REGEXP_CHARVAL)
2140  ret = 1;
2141  else if ((range1->end < range2->start) ||
2142  (range2->end < range1->start))
2143  ret = 0;
2144  else
2145  ret = 1;
2146  } else if (range1->type == XML_REGEXP_CHARVAL) {
2147  int codepoint;
2148  int neg = 0;
2149 
2150  /*
2151  * just check all codepoints in the range for acceptance,
2152  * this is usually way cheaper since done only once at
2153  * compilation than testing over and over at runtime or
2154  * pushing too many states when evaluating.
2155  */
2156  if (((range1->neg == 0) && (range2->neg != 0)) ||
2157  ((range1->neg != 0) && (range2->neg == 0)))
2158  neg = 1;
2159 
2160  for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2161  ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2162  0, range2->start, range2->end,
2163  range2->blockName);
2164  if (ret < 0)
2165  return(-1);
2166  if (((neg == 1) && (ret == 0)) ||
2167  ((neg == 0) && (ret == 1)))
2168  return(1);
2169  }
2170  return(0);
2171  } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2172  (range2->type == XML_REGEXP_BLOCK_NAME)) {
2173  if (range1->type == range2->type) {
2174  ret = xmlStrEqual(range1->blockName, range2->blockName);
2175  } else {
2176  /*
2177  * comparing a block range with anything else is way
2178  * too costly, and maintaining the table is like too much
2179  * memory too, so let's force the automata to save state
2180  * here.
2181  */
2182  return(1);
2183  }
2184  } else if ((range1->type < XML_REGEXP_LETTER) ||
2185  (range2->type < XML_REGEXP_LETTER)) {
2186  if ((range1->type == XML_REGEXP_ANYSPACE) &&
2187  (range2->type == XML_REGEXP_NOTSPACE))
2188  ret = 0;
2189  else if ((range1->type == XML_REGEXP_INITNAME) &&
2190  (range2->type == XML_REGEXP_NOTINITNAME))
2191  ret = 0;
2192  else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2193  (range2->type == XML_REGEXP_NOTNAMECHAR))
2194  ret = 0;
2195  else if ((range1->type == XML_REGEXP_DECIMAL) &&
2196  (range2->type == XML_REGEXP_NOTDECIMAL))
2197  ret = 0;
2198  else if ((range1->type == XML_REGEXP_REALCHAR) &&
2199  (range2->type == XML_REGEXP_NOTREALCHAR))
2200  ret = 0;
2201  else {
2202  /* same thing to limit complexity */
2203  return(1);
2204  }
2205  } else {
2206  ret = 0;
2207  /* range1->type < range2->type here */
2208  switch (range1->type) {
2209  case XML_REGEXP_LETTER:
2210  /* all disjoint except in the subgroups */
2211  if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2212  (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2213  (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2214  (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2215  (range2->type == XML_REGEXP_LETTER_OTHERS))
2216  ret = 1;
2217  break;
2218  case XML_REGEXP_MARK:
2219  if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2220  (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2221  (range2->type == XML_REGEXP_MARK_ENCLOSING))
2222  ret = 1;
2223  break;
2224  case XML_REGEXP_NUMBER:
2225  if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2226  (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2227  (range2->type == XML_REGEXP_NUMBER_OTHERS))
2228  ret = 1;
2229  break;
2230  case XML_REGEXP_PUNCT:
2231  if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2232  (range2->type == XML_REGEXP_PUNCT_DASH) ||
2233  (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2234  (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2235  (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2236  (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2237  (range2->type == XML_REGEXP_PUNCT_OTHERS))
2238  ret = 1;
2239  break;
2240  case XML_REGEXP_SEPAR:
2241  if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2242  (range2->type == XML_REGEXP_SEPAR_LINE) ||
2243  (range2->type == XML_REGEXP_SEPAR_PARA))
2244  ret = 1;
2245  break;
2246  case XML_REGEXP_SYMBOL:
2247  if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2248  (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2249  (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2250  (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2251  ret = 1;
2252  break;
2253  case XML_REGEXP_OTHER:
2254  if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2255  (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2256  (range2->type == XML_REGEXP_OTHER_PRIVATE))
2257  ret = 1;
2258  break;
2259  default:
2260  if ((range2->type >= XML_REGEXP_LETTER) &&
2261  (range2->type < XML_REGEXP_BLOCK_NAME))
2262  ret = 0;
2263  else {
2264  /* safety net ! */
2265  return(1);
2266  }
2267  }
2268  }
2269  if (((range1->neg == 0) && (range2->neg != 0)) ||
2270  ((range1->neg != 0) && (range2->neg == 0)))
2271  ret = !ret;
2272  return(ret);
2273 }
2274 
2285 static int
2286 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2287  if ((type1 == XML_REGEXP_EPSILON) ||
2288  (type1 == XML_REGEXP_CHARVAL) ||
2289  (type1 == XML_REGEXP_RANGES) ||
2290  (type1 == XML_REGEXP_SUBREG) ||
2291  (type1 == XML_REGEXP_STRING) ||
2292  (type1 == XML_REGEXP_ANYCHAR))
2293  return(1);
2294  if ((type2 == XML_REGEXP_EPSILON) ||
2295  (type2 == XML_REGEXP_CHARVAL) ||
2296  (type2 == XML_REGEXP_RANGES) ||
2297  (type2 == XML_REGEXP_SUBREG) ||
2298  (type2 == XML_REGEXP_STRING) ||
2299  (type2 == XML_REGEXP_ANYCHAR))
2300  return(1);
2301 
2302  if (type1 == type2) return(1);
2303 
2304  /* simplify subsequent compares by making sure type1 < type2 */
2305  if (type1 > type2) {
2306  xmlRegAtomType tmp = type1;
2307  type1 = type2;
2308  type2 = tmp;
2309  }
2310  switch (type1) {
2311  case XML_REGEXP_ANYSPACE: /* \s */
2312  /* can't be a letter, number, mark, punctuation, symbol */
2313  if ((type2 == XML_REGEXP_NOTSPACE) ||
2314  ((type2 >= XML_REGEXP_LETTER) &&
2315  (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2316  ((type2 >= XML_REGEXP_NUMBER) &&
2317  (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2318  ((type2 >= XML_REGEXP_MARK) &&
2319  (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2320  ((type2 >= XML_REGEXP_PUNCT) &&
2321  (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2322  ((type2 >= XML_REGEXP_SYMBOL) &&
2323  (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2324  ) return(0);
2325  break;
2326  case XML_REGEXP_NOTSPACE: /* \S */
2327  break;
2328  case XML_REGEXP_INITNAME: /* \l */
2329  /* can't be a number, mark, separator, punctuation, symbol or other */
2330  if ((type2 == XML_REGEXP_NOTINITNAME) ||
2331  ((type2 >= XML_REGEXP_NUMBER) &&
2332  (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2333  ((type2 >= XML_REGEXP_MARK) &&
2334  (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2335  ((type2 >= XML_REGEXP_SEPAR) &&
2336  (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2337  ((type2 >= XML_REGEXP_PUNCT) &&
2338  (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2339  ((type2 >= XML_REGEXP_SYMBOL) &&
2340  (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2341  ((type2 >= XML_REGEXP_OTHER) &&
2342  (type2 <= XML_REGEXP_OTHER_NA))
2343  ) return(0);
2344  break;
2345  case XML_REGEXP_NOTINITNAME: /* \L */
2346  break;
2347  case XML_REGEXP_NAMECHAR: /* \c */
2348  /* can't be a mark, separator, punctuation, symbol or other */
2349  if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2350  ((type2 >= XML_REGEXP_MARK) &&
2351  (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2352  ((type2 >= XML_REGEXP_PUNCT) &&
2353  (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2354  ((type2 >= XML_REGEXP_SEPAR) &&
2355  (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2356  ((type2 >= XML_REGEXP_SYMBOL) &&
2357  (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2358  ((type2 >= XML_REGEXP_OTHER) &&
2359  (type2 <= XML_REGEXP_OTHER_NA))
2360  ) return(0);
2361  break;
2362  case XML_REGEXP_NOTNAMECHAR: /* \C */
2363  break;
2364  case XML_REGEXP_DECIMAL: /* \d */
2365  /* can't be a letter, mark, separator, punctuation, symbol or other */
2366  if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2367  (type2 == XML_REGEXP_REALCHAR) ||
2368  ((type2 >= XML_REGEXP_LETTER) &&
2369  (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2370  ((type2 >= XML_REGEXP_MARK) &&
2371  (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2372  ((type2 >= XML_REGEXP_PUNCT) &&
2373  (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2374  ((type2 >= XML_REGEXP_SEPAR) &&
2375  (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2376  ((type2 >= XML_REGEXP_SYMBOL) &&
2377  (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2378  ((type2 >= XML_REGEXP_OTHER) &&
2379  (type2 <= XML_REGEXP_OTHER_NA))
2380  )return(0);
2381  break;
2382  case XML_REGEXP_NOTDECIMAL: /* \D */
2383  break;
2384  case XML_REGEXP_REALCHAR: /* \w */
2385  /* can't be a mark, separator, punctuation, symbol or other */
2386  if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2387  ((type2 >= XML_REGEXP_MARK) &&
2388  (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2389  ((type2 >= XML_REGEXP_PUNCT) &&
2390  (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2391  ((type2 >= XML_REGEXP_SEPAR) &&
2392  (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2393  ((type2 >= XML_REGEXP_SYMBOL) &&
2394  (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2395  ((type2 >= XML_REGEXP_OTHER) &&
2396  (type2 <= XML_REGEXP_OTHER_NA))
2397  )return(0);
2398  break;
2399  case XML_REGEXP_NOTREALCHAR: /* \W */
2400  break;
2401  /*
2402  * at that point we know both type 1 and type2 are from
2403  * character categories are ordered and are different,
2404  * it becomes simple because this is a partition
2405  */
2406  case XML_REGEXP_LETTER:
2407  if (type2 <= XML_REGEXP_LETTER_OTHERS)
2408  return(1);
2409  return(0);
2410  case XML_REGEXP_LETTER_UPPERCASE:
2411  case XML_REGEXP_LETTER_LOWERCASE:
2412  case XML_REGEXP_LETTER_TITLECASE:
2413  case XML_REGEXP_LETTER_MODIFIER:
2414  case XML_REGEXP_LETTER_OTHERS:
2415  return(0);
2416  case XML_REGEXP_MARK:
2417  if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2418  return(1);
2419  return(0);
2420  case XML_REGEXP_MARK_NONSPACING:
2421  case XML_REGEXP_MARK_SPACECOMBINING:
2422  case XML_REGEXP_MARK_ENCLOSING:
2423  return(0);
2424  case XML_REGEXP_NUMBER:
2425  if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2426  return(1);
2427  return(0);
2428  case XML_REGEXP_NUMBER_DECIMAL:
2429  case XML_REGEXP_NUMBER_LETTER:
2430  case XML_REGEXP_NUMBER_OTHERS:
2431  return(0);
2432  case XML_REGEXP_PUNCT:
2433  if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2434  return(1);
2435  return(0);
2436  case XML_REGEXP_PUNCT_CONNECTOR:
2437  case XML_REGEXP_PUNCT_DASH:
2438  case XML_REGEXP_PUNCT_OPEN:
2439  case XML_REGEXP_PUNCT_CLOSE:
2440  case XML_REGEXP_PUNCT_INITQUOTE:
2441  case XML_REGEXP_PUNCT_FINQUOTE:
2442  case XML_REGEXP_PUNCT_OTHERS:
2443  return(0);
2444  case XML_REGEXP_SEPAR:
2445  if (type2 <= XML_REGEXP_SEPAR_PARA)
2446  return(1);
2447  return(0);
2448  case XML_REGEXP_SEPAR_SPACE:
2449  case XML_REGEXP_SEPAR_LINE:
2450  case XML_REGEXP_SEPAR_PARA:
2451  return(0);
2452  case XML_REGEXP_SYMBOL:
2453  if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2454  return(1);
2455  return(0);
2456  case XML_REGEXP_SYMBOL_MATH:
2457  case XML_REGEXP_SYMBOL_CURRENCY:
2458  case XML_REGEXP_SYMBOL_MODIFIER:
2459  case XML_REGEXP_SYMBOL_OTHERS:
2460  return(0);
2461  case XML_REGEXP_OTHER:
2462  if (type2 <= XML_REGEXP_OTHER_NA)
2463  return(1);
2464  return(0);
2465  case XML_REGEXP_OTHER_CONTROL:
2466  case XML_REGEXP_OTHER_FORMAT:
2467  case XML_REGEXP_OTHER_PRIVATE:
2468  case XML_REGEXP_OTHER_NA:
2469  return(0);
2470  default:
2471  break;
2472  }
2473  return(1);
2474 }
2475 
2487 static int
2488 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2489  int ret = 0;
2490 
2491  if (atom1 == atom2)
2492  return(1);
2493  if ((atom1 == NULL) || (atom2 == NULL))
2494  return(0);
2495 
2496  if (atom1->type != atom2->type)
2497  return(0);
2498  switch (atom1->type) {
2499  case XML_REGEXP_EPSILON:
2500  ret = 0;
2501  break;
2502  case XML_REGEXP_STRING:
2503  if (!deep)
2504  ret = (atom1->valuep == atom2->valuep);
2505  else
2506  ret = xmlStrEqual((xmlChar *)atom1->valuep,
2507  (xmlChar *)atom2->valuep);
2508  break;
2509  case XML_REGEXP_CHARVAL:
2510  ret = (atom1->codepoint == atom2->codepoint);
2511  break;
2512  case XML_REGEXP_RANGES:
2513  /* too hard to do in the general case */
2514  ret = 0;
2515  default:
2516  break;
2517  }
2518  return(ret);
2519 }
2520 
2532 static int
2533 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2534  int ret = 1;
2535 
2536  if (atom1 == atom2)
2537  return(1);
2538  if ((atom1 == NULL) || (atom2 == NULL))
2539  return(0);
2540 
2541  if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2542  (atom2->type == XML_REGEXP_ANYCHAR))
2543  return(1);
2544 
2545  if (atom1->type > atom2->type) {
2546  xmlRegAtomPtr tmp;
2547  tmp = atom1;
2548  atom1 = atom2;
2549  atom2 = tmp;
2550  }
2551  if (atom1->type != atom2->type) {
2552  ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2553  /* if they can't intersect at the type level break now */
2554  if (ret == 0)
2555  return(0);
2556  }
2557  switch (atom1->type) {
2558  case XML_REGEXP_STRING:
2559  if (!deep)
2560  ret = (atom1->valuep != atom2->valuep);
2561  else {
2562  xmlChar *val1 = (xmlChar *)atom1->valuep;
2563  xmlChar *val2 = (xmlChar *)atom2->valuep;
2564  int compound1 = (xmlStrchr(val1, '|') != NULL);
2565  int compound2 = (xmlStrchr(val2, '|') != NULL);
2566 
2567  /* Ignore negative match flag for ##other namespaces */
2568  if (compound1 != compound2)
2569  return(0);
2570 
2571  ret = xmlRegStrEqualWildcard(val1, val2);
2572  }
2573  break;
2574  case XML_REGEXP_EPSILON:
2575  goto not_determinist;
2576  case XML_REGEXP_CHARVAL:
2577  if (atom2->type == XML_REGEXP_CHARVAL) {
2578  ret = (atom1->codepoint == atom2->codepoint);
2579  } else {
2580  ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2581  if (ret < 0)
2582  ret = 1;
2583  }
2584  break;
2585  case XML_REGEXP_RANGES:
2586  if (atom2->type == XML_REGEXP_RANGES) {
2587  int i, j, res;
2588  xmlRegRangePtr r1, r2;
2589 
2590  /*
2591  * need to check that none of the ranges eventually matches
2592  */
2593  for (i = 0;i < atom1->nbRanges;i++) {
2594  for (j = 0;j < atom2->nbRanges;j++) {
2595  r1 = atom1->ranges[i];
2596  r2 = atom2->ranges[j];
2597  res = xmlFACompareRanges(r1, r2);
2598  if (res == 1) {
2599  ret = 1;
2600  goto done;
2601  }
2602  }
2603  }
2604  ret = 0;
2605  }
2606  break;
2607  default:
2608  goto not_determinist;
2609  }
2610 done:
2611  if (atom1->neg != atom2->neg) {
2612  ret = !ret;
2613  }
2614  if (ret == 0)
2615  return(0);
2616 not_determinist:
2617  return(1);
2618 }
2619 
2628 static int
2629 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2630  int to, xmlRegAtomPtr atom) {
2631  int ret = 1;
2632  int res;
2633  int transnr, nbTrans;
2634  xmlRegTransPtr t1;
2635  int deep = 1;
2636 
2637  if (state == NULL)
2638  return(ret);
2639  if (state->markd == XML_REGEXP_MARK_VISITED)
2640  return(ret);
2641 
2642  if (ctxt->flags & AM_AUTOMATA_RNG)
2643  deep = 0;
2644 
2645  /*
2646  * don't recurse on transitions potentially added in the course of
2647  * the elimination.
2648  */
2649  nbTrans = state->nbTrans;
2650  for (transnr = 0;transnr < nbTrans;transnr++) {
2651  t1 = &(state->trans[transnr]);
2652  /*
2653  * check transitions conflicting with the one looked at
2654  */
2655  if (t1->atom == NULL) {
2656  if (t1->to < 0)
2657  continue;
2658  state->markd = XML_REGEXP_MARK_VISITED;
2659  res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2660  to, atom);
2661  if (res == 0) {
2662  ret = 0;
2663  /* t1->nd = 1; */
2664  }
2665  continue;
2666  }
2667  if (t1->to != to)
2668  continue;
2669  if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2670  ret = 0;
2671  /* mark the transition as non-deterministic */
2672  t1->nd = 1;
2673  }
2674  }
2675  return(ret);
2676 }
2677 
2684 static void
2685 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2686  int transnr, nbTrans;
2687 
2688  if (state == NULL)
2689  return;
2690  if (state->markd != XML_REGEXP_MARK_VISITED)
2691  return;
2692  state->markd = 0;
2693 
2694  nbTrans = state->nbTrans;
2695  for (transnr = 0; transnr < nbTrans; transnr++) {
2696  xmlRegTransPtr t1 = &state->trans[transnr];
2697  if ((t1->atom == NULL) && (t1->to >= 0))
2698  xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2699  }
2700 }
2701 
2710 static int
2711 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2712  int statenr, transnr;
2713  xmlRegStatePtr state;
2714  xmlRegTransPtr t1, t2, last;
2715  int i;
2716  int ret = 1;
2717  int deep = 1;
2718 
2719 #ifdef DEBUG_REGEXP_GRAPH
2720  printf("xmlFAComputesDeterminism\n");
2721  xmlRegPrintCtxt(stdout, ctxt);
2722 #endif
2723  if (ctxt->determinist != -1)
2724  return(ctxt->determinist);
2725 
2726  if (ctxt->flags & AM_AUTOMATA_RNG)
2727  deep = 0;
2728 
2729  /*
2730  * First cleanup the automata removing cancelled transitions
2731  */
2732  for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2733  state = ctxt->states[statenr];
2734  if (state == NULL)
2735  continue;
2736  if (state->nbTrans < 2)
2737  continue;
2738  for (transnr = 0;transnr < state->nbTrans;transnr++) {
2739  t1 = &(state->trans[transnr]);
2740  /*
2741  * Determinism checks in case of counted or all transitions
2742  * will have to be handled separately
2743  */
2744  if (t1->atom == NULL) {
2745  /* t1->nd = 1; */
2746  continue;
2747  }
2748  if (t1->to == -1) /* eliminated */
2749  continue;
2750  for (i = 0;i < transnr;i++) {
2751  t2 = &(state->trans[i]);
2752  if (t2->to == -1) /* eliminated */
2753  continue;
2754  if (t2->atom != NULL) {
2755  if (t1->to == t2->to) {
2756  /*
2757  * Here we use deep because we want to keep the
2758  * transitions which indicate a conflict
2759  */
2760  if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2761  (t1->counter == t2->counter) &&
2762  (t1->count == t2->count))
2763  t2->to = -1; /* eliminated */
2764  }
2765  }
2766  }
2767  }
2768  }
2769 
2770  /*
2771  * Check for all states that there aren't 2 transitions
2772  * with the same atom and a different target.
2773  */
2774  for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2775  state = ctxt->states[statenr];
2776  if (state == NULL)
2777  continue;
2778  if (state->nbTrans < 2)
2779  continue;
2780  last = NULL;
2781  for (transnr = 0;transnr < state->nbTrans;transnr++) {
2782  t1 = &(state->trans[transnr]);
2783  /*
2784  * Determinism checks in case of counted or all transitions
2785  * will have to be handled separately
2786  */
2787  if (t1->atom == NULL) {
2788  continue;
2789  }
2790  if (t1->to == -1) /* eliminated */
2791  continue;
2792  for (i = 0;i < transnr;i++) {
2793  t2 = &(state->trans[i]);
2794  if (t2->to == -1) /* eliminated */
2795  continue;
2796  if (t2->atom != NULL) {
2797  /*
2798  * But here we don't use deep because we want to
2799  * find transitions which indicate a conflict
2800  */
2801  if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2802  ret = 0;
2803  /* mark the transitions as non-deterministic ones */
2804  t1->nd = 1;
2805  t2->nd = 1;
2806  last = t1;
2807  }
2808  } else if (t1->to != -1) {
2809  /*
2810  * do the closure in case of remaining specific
2811  * epsilon transitions like choices or all
2812  */
2813  ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2814  t2->to, t2->atom);
2815  xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2816  /* don't shortcut the computation so all non deterministic
2817  transition get marked down
2818  if (ret == 0)
2819  return(0);
2820  */
2821  if (ret == 0) {
2822  t1->nd = 1;
2823  /* t2->nd = 1; */
2824  last = t1;
2825  }
2826  }
2827  }
2828  /* don't shortcut the computation so all non deterministic
2829  transition get marked down
2830  if (ret == 0)
2831  break; */
2832  }
2833 
2834  /*
2835  * mark specifically the last non-deterministic transition
2836  * from a state since there is no need to set-up rollback
2837  * from it
2838  */
2839  if (last != NULL) {
2840  last->nd = 2;
2841  }
2842 
2843  /* don't shortcut the computation so all non deterministic
2844  transition get marked down
2845  if (ret == 0)
2846  break; */
2847  }
2848 
2849  ctxt->determinist = ret;
2850  return(ret);
2851 }
2852 
2853 /************************************************************************
2854  * *
2855  * Routines to check input against transition atoms *
2856  * *
2857  ************************************************************************/
2858 
2859 static int
2860 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2861  int start, int end, const xmlChar *blockName) {
2862  int ret = 0;
2863 
2864  switch (type) {
2865  case XML_REGEXP_STRING:
2866  case XML_REGEXP_SUBREG:
2867  case XML_REGEXP_RANGES:
2868  case XML_REGEXP_EPSILON:
2869  return(-1);
2870  case XML_REGEXP_ANYCHAR:
2871  ret = ((codepoint != '\n') && (codepoint != '\r'));
2872  break;
2873  case XML_REGEXP_CHARVAL:
2874  ret = ((codepoint >= start) && (codepoint <= end));
2875  break;
2876  case XML_REGEXP_NOTSPACE:
2877  neg = !neg;
2878  /* Falls through. */
2879  case XML_REGEXP_ANYSPACE:
2880  ret = ((codepoint == '\n') || (codepoint == '\r') ||
2881  (codepoint == '\t') || (codepoint == ' '));
2882  break;
2883  case XML_REGEXP_NOTINITNAME:
2884  neg = !neg;
2885  /* Falls through. */
2886  case XML_REGEXP_INITNAME:
2887  ret = (IS_LETTER(codepoint) ||
2888  (codepoint == '_') || (codepoint == ':'));
2889  break;
2890  case XML_REGEXP_NOTNAMECHAR:
2891  neg = !neg;
2892  /* Falls through. */
2893  case XML_REGEXP_NAMECHAR:
2894  ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2895  (codepoint == '.') || (codepoint == '-') ||
2896  (codepoint == '_') || (codepoint == ':') ||
2897  IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2898  break;
2899  case XML_REGEXP_NOTDECIMAL:
2900  neg = !neg;
2901  /* Falls through. */
2902  case XML_REGEXP_DECIMAL:
2903  ret = xmlUCSIsCatNd(codepoint);
2904  break;
2905  case XML_REGEXP_REALCHAR:
2906  neg = !neg;
2907  /* Falls through. */
2908  case XML_REGEXP_NOTREALCHAR:
2909  ret = xmlUCSIsCatP(codepoint);
2910  if (ret == 0)
2911  ret = xmlUCSIsCatZ(codepoint);
2912  if (ret == 0)
2913  ret = xmlUCSIsCatC(codepoint);
2914  break;
2915  case XML_REGEXP_LETTER:
2916  ret = xmlUCSIsCatL(codepoint);
2917  break;
2918  case XML_REGEXP_LETTER_UPPERCASE:
2919  ret = xmlUCSIsCatLu(codepoint);
2920  break;
2921  case XML_REGEXP_LETTER_LOWERCASE:
2922  ret = xmlUCSIsCatLl(codepoint);
2923  break;
2924  case XML_REGEXP_LETTER_TITLECASE:
2925  ret = xmlUCSIsCatLt(codepoint);
2926  break;
2927  case XML_REGEXP_LETTER_MODIFIER:
2928  ret = xmlUCSIsCatLm(codepoint);
2929  break;
2930  case XML_REGEXP_LETTER_OTHERS:
2931  ret = xmlUCSIsCatLo(codepoint);
2932  break;
2933  case XML_REGEXP_MARK:
2934  ret = xmlUCSIsCatM(codepoint);
2935  break;
2936  case XML_REGEXP_MARK_NONSPACING:
2937  ret = xmlUCSIsCatMn(codepoint);
2938  break;
2939  case XML_REGEXP_MARK_SPACECOMBINING:
2940  ret = xmlUCSIsCatMc(codepoint);
2941  break;
2942  case XML_REGEXP_MARK_ENCLOSING:
2943  ret = xmlUCSIsCatMe(codepoint);
2944  break;
2945  case XML_REGEXP_NUMBER:
2946  ret = xmlUCSIsCatN(codepoint);
2947  break;
2948  case XML_REGEXP_NUMBER_DECIMAL:
2949  ret = xmlUCSIsCatNd(codepoint);
2950  break;
2951  case XML_REGEXP_NUMBER_LETTER:
2952  ret = xmlUCSIsCatNl(codepoint);
2953  break;
2954  case XML_REGEXP_NUMBER_OTHERS:
2955  ret = xmlUCSIsCatNo(codepoint);
2956  break;
2957  case XML_REGEXP_PUNCT:
2958  ret = xmlUCSIsCatP(codepoint);
2959  break;
2960  case XML_REGEXP_PUNCT_CONNECTOR:
2961  ret = xmlUCSIsCatPc(codepoint);
2962  break;
2963  case XML_REGEXP_PUNCT_DASH:
2964  ret = xmlUCSIsCatPd(codepoint);
2965  break;
2966  case XML_REGEXP_PUNCT_OPEN:
2967  ret = xmlUCSIsCatPs(codepoint);
2968  break;
2969  case XML_REGEXP_PUNCT_CLOSE:
2970  ret = xmlUCSIsCatPe(codepoint);
2971  break;
2972  case XML_REGEXP_PUNCT_INITQUOTE:
2973  ret = xmlUCSIsCatPi(codepoint);
2974  break;
2975  case XML_REGEXP_PUNCT_FINQUOTE:
2976  ret = xmlUCSIsCatPf(codepoint);
2977  break;
2978  case XML_REGEXP_PUNCT_OTHERS:
2979  ret = xmlUCSIsCatPo(codepoint);
2980  break;
2981  case XML_REGEXP_SEPAR:
2982  ret = xmlUCSIsCatZ(codepoint);
2983  break;
2984  case XML_REGEXP_SEPAR_SPACE:
2985  ret = xmlUCSIsCatZs(codepoint);
2986  break;
2987  case XML_REGEXP_SEPAR_LINE:
2988  ret = xmlUCSIsCatZl(codepoint);
2989  break;
2990  case XML_REGEXP_SEPAR_PARA:
2991  ret = xmlUCSIsCatZp(codepoint);
2992  break;
2993  case XML_REGEXP_SYMBOL:
2994  ret = xmlUCSIsCatS(codepoint);
2995  break;
2996  case XML_REGEXP_SYMBOL_MATH:
2997  ret = xmlUCSIsCatSm(codepoint);
2998  break;
2999  case XML_REGEXP_SYMBOL_CURRENCY:
3000  ret = xmlUCSIsCatSc(codepoint);
3001  break;
3002  case XML_REGEXP_SYMBOL_MODIFIER:
3003  ret = xmlUCSIsCatSk(codepoint);
3004  break;
3005  case XML_REGEXP_SYMBOL_OTHERS:
3006  ret = xmlUCSIsCatSo(codepoint);
3007  break;
3008  case XML_REGEXP_OTHER:
3009  ret = xmlUCSIsCatC(codepoint);
3010  break;
3011  case XML_REGEXP_OTHER_CONTROL:
3012  ret = xmlUCSIsCatCc(codepoint);
3013  break;
3014  case XML_REGEXP_OTHER_FORMAT:
3015  ret = xmlUCSIsCatCf(codepoint);
3016  break;
3017  case XML_REGEXP_OTHER_PRIVATE:
3018  ret = xmlUCSIsCatCo(codepoint);
3019  break;
3020  case XML_REGEXP_OTHER_NA:
3021  /* ret = xmlUCSIsCatCn(codepoint); */
3022  /* Seems it doesn't exist anymore in recent Unicode releases */
3023  ret = 0;
3024  break;
3025  case XML_REGEXP_BLOCK_NAME:
3026  ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
3027  break;
3028  }
3029  if (neg)
3030  return(!ret);
3031  return(ret);
3032 }
3033 
3034 static int
3035 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
3036  int i, ret = 0;
3037  xmlRegRangePtr range;
3038 
3039  if ((atom == NULL) || (!IS_CHAR(codepoint)))
3040  return(-1);
3041 
3042  switch (atom->type) {
3043  case XML_REGEXP_SUBREG:
3044  case XML_REGEXP_EPSILON:
3045  return(-1);
3046  case XML_REGEXP_CHARVAL:
3047  return(codepoint == atom->codepoint);
3048  case XML_REGEXP_RANGES: {
3049  int accept = 0;
3050 
3051  for (i = 0;i < atom->nbRanges;i++) {
3052  range = atom->ranges[i];
3053  if (range->neg == 2) {
3054  ret = xmlRegCheckCharacterRange(range->type, codepoint,
3055  0, range->start, range->end,
3056  range->blockName);
3057  if (ret != 0)
3058  return(0); /* excluded char */
3059  } else if (range->neg) {
3060  ret = xmlRegCheckCharacterRange(range->type, codepoint,
3061  0, range->start, range->end,
3062  range->blockName);
3063  if (ret == 0)
3064  accept = 1;
3065  else
3066  return(0);
3067  } else {
3068  ret = xmlRegCheckCharacterRange(range->type, codepoint,
3069  0, range->start, range->end,
3070  range->blockName);
3071  if (ret != 0)
3072  accept = 1; /* might still be excluded */
3073  }
3074  }
3075  return(accept);
3076  }
3077  case XML_REGEXP_STRING:
3078  printf("TODO: XML_REGEXP_STRING\n");
3079  return(-1);
3080  case XML_REGEXP_ANYCHAR:
3081  case XML_REGEXP_ANYSPACE:
3082  case XML_REGEXP_NOTSPACE:
3083  case XML_REGEXP_INITNAME:
3084  case XML_REGEXP_NOTINITNAME:
3085  case XML_REGEXP_NAMECHAR:
3086  case XML_REGEXP_NOTNAMECHAR:
3087  case XML_REGEXP_DECIMAL:
3088  case XML_REGEXP_NOTDECIMAL:
3089  case XML_REGEXP_REALCHAR:
3090  case XML_REGEXP_NOTREALCHAR:
3091  case XML_REGEXP_LETTER:
3092  case XML_REGEXP_LETTER_UPPERCASE:
3093  case XML_REGEXP_LETTER_LOWERCASE:
3094  case XML_REGEXP_LETTER_TITLECASE:
3095  case XML_REGEXP_LETTER_MODIFIER:
3096  case XML_REGEXP_LETTER_OTHERS:
3097  case XML_REGEXP_MARK:
3098  case XML_REGEXP_MARK_NONSPACING:
3099  case XML_REGEXP_MARK_SPACECOMBINING:
3100  case XML_REGEXP_MARK_ENCLOSING:
3101  case XML_REGEXP_NUMBER:
3102  case XML_REGEXP_NUMBER_DECIMAL:
3103  case XML_REGEXP_NUMBER_LETTER:
3104  case XML_REGEXP_NUMBER_OTHERS:
3105  case XML_REGEXP_PUNCT:
3106  case XML_REGEXP_PUNCT_CONNECTOR:
3107  case XML_REGEXP_PUNCT_DASH:
3108  case XML_REGEXP_PUNCT_OPEN:
3109  case XML_REGEXP_PUNCT_CLOSE:
3110  case XML_REGEXP_PUNCT_INITQUOTE:
3111  case XML_REGEXP_PUNCT_FINQUOTE:
3112  case XML_REGEXP_PUNCT_OTHERS:
3113  case XML_REGEXP_SEPAR:
3114  case XML_REGEXP_SEPAR_SPACE:
3115  case XML_REGEXP_SEPAR_LINE:
3116  case XML_REGEXP_SEPAR_PARA:
3117  case XML_REGEXP_SYMBOL:
3118  case XML_REGEXP_SYMBOL_MATH:
3119  case XML_REGEXP_SYMBOL_CURRENCY:
3120  case XML_REGEXP_SYMBOL_MODIFIER:
3121  case XML_REGEXP_SYMBOL_OTHERS:
3122  case XML_REGEXP_OTHER:
3123  case XML_REGEXP_OTHER_CONTROL:
3124  case XML_REGEXP_OTHER_FORMAT:
3125  case XML_REGEXP_OTHER_PRIVATE:
3126  case XML_REGEXP_OTHER_NA:
3127  case XML_REGEXP_BLOCK_NAME:
3128  ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3129  (const xmlChar *)atom->valuep);
3130  if (atom->neg)
3131  ret = !ret;
3132  break;
3133  }
3134  return(ret);
3135 }
3136 
3137 /************************************************************************
3138  * *
3139  * Saving and restoring state of an execution context *
3140  * *
3141  ************************************************************************/
3142 
3143 #ifdef DEBUG_REGEXP_EXEC
3144 static void
3145 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
3146  printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
3147  if (exec->inputStack != NULL) {
3148  int i;
3149  printf(": ");
3150  for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
3151  printf("%s ", (const char *)
3152  exec->inputStack[exec->inputStackNr - (i + 1)].value);
3153  } else {
3154  printf(": %s", &(exec->inputString[exec->index]));
3155  }
3156  printf("\n");
3157 }
3158 #endif
3159 
3160 static void
3161 xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3162 #ifdef DEBUG_REGEXP_EXEC
3163  printf("saving ");
3164  exec->transno++;
3165  xmlFARegDebugExec(exec);
3166  exec->transno--;
3167 #endif
3168 #ifdef MAX_PUSH
3169  if (exec->nbPush > MAX_PUSH) {
3170  return;
3171  }
3172  exec->nbPush++;
3173 #endif
3174 
3175  if (exec->maxRollbacks == 0) {
3176  exec->maxRollbacks = 4;
3177  exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3178  sizeof(xmlRegExecRollback));
3179  if (exec->rollbacks == NULL) {
3180  xmlRegexpErrMemory(NULL, "saving regexp");
3181  exec->maxRollbacks = 0;
3182  return;
3183  }
3184  memset(exec->rollbacks, 0,
3185  exec->maxRollbacks * sizeof(xmlRegExecRollback));
3186  } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3187  xmlRegExecRollback *tmp;
3188  int len = exec->maxRollbacks;
3189 
3190  exec->maxRollbacks *= 2;
3191  tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3192  exec->maxRollbacks * sizeof(xmlRegExecRollback));
3193  if (tmp == NULL) {
3194  xmlRegexpErrMemory(NULL, "saving regexp");
3195  exec->maxRollbacks /= 2;
3196  return;
3197  }
3198  exec->rollbacks = tmp;
3199  tmp = &exec->rollbacks[len];
3200  memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3201  }
3202  exec->rollbacks[exec->nbRollbacks].state = exec->state;
3203  exec->rollbacks[exec->nbRollbacks].index = exec->index;
3204  exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3205  if (exec->comp->nbCounters > 0) {
3206  if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3207  exec->rollbacks[exec->nbRollbacks].counts = (int *)
3208  xmlMalloc(exec->comp->nbCounters * sizeof(int));
3209  if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3210  xmlRegexpErrMemory(NULL, "saving regexp");
3211  exec->status = -5;
3212  return;
3213  }
3214  }
3215  memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3216  exec->comp->nbCounters * sizeof(int));
3217  }
3218  exec->nbRollbacks++;
3219 }
3220 
3221 static void
3222 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3223  if (exec->nbRollbacks <= 0) {
3224  exec->status = -1;
3225 #ifdef DEBUG_REGEXP_EXEC
3226  printf("rollback failed on empty stack\n");
3227 #endif
3228  return;
3229  }
3230  exec->nbRollbacks--;
3231  exec->state = exec->rollbacks[exec->nbRollbacks].state;
3232  exec->index = exec->rollbacks[exec->nbRollbacks].index;
3233  exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3234  if (exec->comp->nbCounters > 0) {
3235  if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3236  fprintf(stderr, "exec save: allocation failed");
3237  exec->status = -6;
3238  return;
3239  }
3240  if (exec->counts) {
3241  memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3242  exec->comp->nbCounters * sizeof(int));
3243  }
3244  }
3245 
3246 #ifdef DEBUG_REGEXP_EXEC
3247  printf("restored ");
3248  xmlFARegDebugExec(exec);
3249 #endif
3250 }
3251 
3252 /************************************************************************
3253  * *
3254  * Verifier, running an input against a compiled regexp *
3255  * *
3256  ************************************************************************/
3257 
3258 static int
3259 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3260  xmlRegExecCtxt execval;
3261  xmlRegExecCtxtPtr exec = &execval;
3262  int ret, codepoint = 0, len, deter;
3263 
3264  exec->inputString = content;
3265  exec->index = 0;
3266  exec->nbPush = 0;
3267  exec->determinist = 1;
3268  exec->maxRollbacks = 0;
3269  exec->nbRollbacks = 0;
3270  exec->rollbacks = NULL;
3271  exec->status = 0;
3272  exec->comp = comp;
3273  exec->state = comp->states[0];
3274  exec->transno = 0;
3275  exec->transcount = 0;
3276  exec->inputStack = NULL;
3277  exec->inputStackMax = 0;
3278  if (comp->nbCounters > 0) {
3279  exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3280  if (exec->counts == NULL) {
3281  xmlRegexpErrMemory(NULL, "running regexp");
3282  return(-1);
3283  }
3284  memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3285  } else
3286  exec->counts = NULL;
3287  while ((exec->status == 0) && (exec->state != NULL) &&
3288  ((exec->inputString[exec->index] != 0) ||
3289  ((exec->state != NULL) &&
3290  (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3291  xmlRegTransPtr trans;
3292  xmlRegAtomPtr atom;
3293 
3294  /*
3295  * If end of input on non-terminal state, rollback, however we may
3296  * still have epsilon like transition for counted transitions
3297  * on counters, in that case don't break too early. Additionally,
3298  * if we are working on a range like "AB{0,2}", where B is not present,
3299  * we don't want to break.
3300  */
3301  len = 1;
3302  if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3303  /*
3304  * if there is a transition, we must check if
3305  * atom allows minOccurs of 0
3306  */
3307  if (exec->transno < exec->state->nbTrans) {
3308  trans = &exec->state->trans[exec->transno];
3309  if (trans->to >=0) {
3310  atom = trans->atom;
3311  if (!((atom->min == 0) && (atom->max > 0)))
3312  goto rollback;
3313  }
3314  } else
3315  goto rollback;
3316  }
3317 
3318  exec->transcount = 0;
3319  for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3320  trans = &exec->state->trans[exec->transno];
3321  if (trans->to < 0)
3322  continue;
3323  atom = trans->atom;
3324  ret = 0;
3325  deter = 1;
3326  if (trans->count >= 0) {
3327  int count;
3328  xmlRegCounterPtr counter;
3329 
3330  if (exec->counts == NULL) {
3331  exec->status = -1;
3332  goto error;
3333  }
3334  /*
3335  * A counted transition.
3336  */
3337 
3338  count = exec->counts[trans->count];
3339  counter = &exec->comp->counters[trans->count];
3340 #ifdef DEBUG_REGEXP_EXEC
3341  printf("testing count %d: val %d, min %d, max %d\n",
3342  trans->count, count, counter->min, counter->max);
3343 #endif
3344  ret = ((count >= counter->min) && (count <= counter->max));
3345  if ((ret) && (counter->min != counter->max))
3346  deter = 0;
3347  } else if (atom == NULL) {
3348  fprintf(stderr, "epsilon transition left at runtime\n");
3349  exec->status = -2;
3350  break;
3351  } else if (exec->inputString[exec->index] != 0) {
3352  codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3353  ret = xmlRegCheckCharacter(atom, codepoint);
3354  if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3355  xmlRegStatePtr to = comp->states[trans->to];
3356 
3357  /*
3358  * this is a multiple input sequence
3359  * If there is a counter associated increment it now.
3360  * before potentially saving and rollback
3361  * do not increment if the counter is already over the
3362  * maximum limit in which case get to next transition
3363  */
3364  if (trans->counter >= 0) {
3365  xmlRegCounterPtr counter;
3366 
3367  if ((exec->counts == NULL) ||
3368  (exec->comp == NULL) ||
3369  (exec->comp->counters == NULL)) {
3370  exec->status = -1;
3371  goto error;
3372  }
3373  counter = &exec->comp->counters[trans->counter];
3374  if (exec->counts[trans->counter] >= counter->max)
3375  continue; /* for loop on transitions */
3376 
3377 #ifdef DEBUG_REGEXP_EXEC
3378  printf("Increasing count %d\n", trans->counter);
3379 #endif
3380  exec->counts[trans->counter]++;
3381  }
3382  if (exec->state->nbTrans > exec->transno + 1) {
3383  xmlFARegExecSave(exec);
3384  }
3385  exec->transcount = 1;
3386  do {
3387  /*
3388  * Try to progress as much as possible on the input
3389  */
3390  if (exec->transcount == atom->max) {
3391  break;
3392  }
3393  exec->index += len;
3394  /*
3395  * End of input: stop here
3396  */
3397  if (exec->inputString[exec->index] == 0) {
3398  exec->index -= len;
3399  break;
3400  }
3401  if (exec->transcount >= atom->min) {
3402  int transno = exec->transno;
3403  xmlRegStatePtr state = exec->state;
3404 
3405  /*
3406  * The transition is acceptable save it
3407  */
3408  exec->transno = -1; /* trick */
3409  exec->state = to;
3410  xmlFARegExecSave(exec);
3411  exec->transno = transno;
3412  exec->state = state;
3413  }
3414  codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3415  len);
3416  ret = xmlRegCheckCharacter(atom, codepoint);
3417  exec->transcount++;
3418  } while (ret == 1);
3419  if (exec->transcount < atom->min)
3420  ret = 0;
3421 
3422  /*
3423  * If the last check failed but one transition was found
3424  * possible, rollback
3425  */
3426  if (ret < 0)
3427  ret = 0;
3428  if (ret == 0) {
3429  goto rollback;
3430  }
3431  if (trans->counter >= 0) {
3432  if (exec->counts == NULL) {
3433  exec->status = -1;
3434  goto error;
3435  }
3436 #ifdef DEBUG_REGEXP_EXEC
3437  printf("Decreasing count %d\n", trans->counter);
3438 #endif
3439  exec->counts[trans->counter]--;
3440  }
3441  } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3442  /*
3443  * we don't match on the codepoint, but minOccurs of 0
3444  * says that's ok. Setting len to 0 inhibits stepping
3445  * over the codepoint.
3446  */
3447  exec->transcount = 1;
3448  len = 0;
3449  ret = 1;
3450  }
3451  } else if ((atom->min == 0) && (atom->max > 0)) {
3452  /* another spot to match when minOccurs is 0 */
3453  exec->transcount = 1;
3454  len = 0;
3455  ret = 1;
3456  }
3457  if (ret == 1) {
3458  if ((trans->nd == 1) ||
3459  ((trans->count >= 0) && (deter == 0) &&
3460  (exec->state->nbTrans > exec->transno + 1))) {
3461 #ifdef DEBUG_REGEXP_EXEC
3462  if (trans->nd == 1)
3463  printf("Saving on nd transition atom %d for %c at %d\n",
3464  trans->atom->no, codepoint, exec->index);
3465  else
3466  printf("Saving on counted transition count %d for %c at %d\n",
3467  trans->count, codepoint, exec->index);
3468 #endif
3469  xmlFARegExecSave(exec);
3470  }
3471  if (trans->counter >= 0) {
3472  xmlRegCounterPtr counter;
3473 
3474  /* make sure we don't go over the counter maximum value */
3475  if ((exec->counts == NULL) ||
3476  (exec->comp == NULL) ||
3477  (exec->comp->counters == NULL)) {
3478  exec->status = -1;
3479  goto error;
3480  }
3481  counter = &exec->comp->counters[trans->counter];
3482  if (exec->counts[trans->counter] >= counter->max)
3483  continue; /* for loop on transitions */
3484 #ifdef DEBUG_REGEXP_EXEC
3485  printf("Increasing count %d\n", trans->counter);
3486 #endif
3487  exec->counts[trans->counter]++;
3488  }
3489  if ((trans->count >= 0) &&
3490  (trans->count < REGEXP_ALL_COUNTER)) {
3491  if (exec->counts == NULL) {
3492  exec->status = -1;
3493  goto error;
3494  }
3495 #ifdef DEBUG_REGEXP_EXEC
3496  printf("resetting count %d on transition\n",
3497  trans->count);
3498 #endif
3499  exec->counts[trans->count] = 0;
3500  }
3501 #ifdef DEBUG_REGEXP_EXEC
3502  printf("entering state %d\n", trans->to);
3503 #endif
3504  exec->state = comp->states[trans->to];
3505  exec->transno = 0;
3506  if (trans->atom != NULL) {
3507  exec->index += len;
3508  }
3509  goto progress;
3510  } else if (ret < 0) {
3511  exec->status = -4;
3512  break;
3513  }
3514  }
3515  if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3516 rollback:
3517  /*
3518  * Failed to find a way out
3519  */
3520  exec->determinist = 0;
3521 #ifdef DEBUG_REGEXP_EXEC
3522  printf("rollback from state %d on %d:%c\n", exec->state->no,
3523  codepoint,codepoint);
3524 #endif
3525  xmlFARegExecRollBack(exec);
3526  }
3527 progress:
3528  continue;
3529  }
3530 error:
3531  if (exec->rollbacks != NULL) {
3532  if (exec->counts != NULL) {
3533  int i;
3534 
3535  for (i = 0;i < exec->maxRollbacks;i++)
3536  if (exec->rollbacks[i].counts != NULL)
3537  xmlFree(exec->rollbacks[i].counts);
3538  }
3539  xmlFree(exec->rollbacks);
3540  }
3541  if (exec->state == NULL)
3542  return(-1);
3543  if (exec->counts != NULL)
3544  xmlFree(exec->counts);
3545  if (exec->status == 0)
3546  return(1);
3547  if (exec->status == -1) {
3548  if (exec->nbPush > MAX_PUSH)
3549  return(-1);
3550  return(0);
3551  }
3552  return(exec->status);
3553 }
3554 
3555 /************************************************************************
3556  * *
3557  * Progressive interface to the verifier one atom at a time *
3558  * *
3559  ************************************************************************/
3560 #ifdef DEBUG_ERR
3561 static void testerr(xmlRegExecCtxtPtr exec);
3562 #endif
3563 
3575 xmlRegExecCtxtPtr
3576 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3577  xmlRegExecCtxtPtr exec;
3578 
3579  if (comp == NULL)
3580  return(NULL);
3581  if ((comp->compact == NULL) && (comp->states == NULL))
3582  return(NULL);
3583  exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3584  if (exec == NULL) {
3585  xmlRegexpErrMemory(NULL, "creating execution context");
3586  return(NULL);
3587  }
3588  memset(exec, 0, sizeof(xmlRegExecCtxt));
3589  exec->inputString = NULL;
3590  exec->index = 0;
3591  exec->determinist = 1;
3592  exec->maxRollbacks = 0;
3593  exec->nbRollbacks = 0;
3594  exec->rollbacks = NULL;
3595  exec->status = 0;
3596  exec->comp = comp;
3597  if (comp->compact == NULL)
3598  exec->state = comp->states[0];
3599  exec->transno = 0;
3600  exec->transcount = 0;
3601  exec->callback = callback;
3602  exec->data = data;
3603  if (comp->nbCounters > 0) {
3604  /*
3605  * For error handling, exec->counts is allocated twice the size
3606  * the second half is used to store the data in case of rollback
3607  */
3608  exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3609  * 2);
3610  if (exec->counts == NULL) {
3611  xmlRegexpErrMemory(NULL, "creating execution context");
3612  xmlFree(exec);
3613  return(NULL);
3614  }
3615  memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3616  exec->errCounts = &exec->counts[comp->nbCounters];
3617  } else {
3618  exec->counts = NULL;
3619  exec->errCounts = NULL;
3620  }
3621  exec->inputStackMax = 0;
3622  exec->inputStackNr = 0;
3623  exec->inputStack = NULL;
3624  exec->errStateNo = -1;
3625  exec->errString = NULL;
3626  exec->nbPush = 0;
3627  return(exec);
3628 }
3629 
3636 void
3637 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3638  if (exec == NULL)
3639  return;
3640 
3641  if (exec->rollbacks != NULL) {
3642  if (exec->counts != NULL) {
3643  int i;
3644 
3645  for (i = 0;i < exec->maxRollbacks;i++)
3646  if (exec->rollbacks[i].counts != NULL)
3647  xmlFree(exec->rollbacks[i].counts);
3648  }
3649  xmlFree(exec->rollbacks);
3650  }
3651  if (exec->counts != NULL)
3652  xmlFree(exec->counts);
3653  if (exec->inputStack != NULL) {
3654  int i;
3655 
3656  for (i = 0;i < exec->inputStackNr;i++) {
3657  if (exec->inputStack[i].value != NULL)
3658  xmlFree(exec->inputStack[i].value);
3659  }
3660  xmlFree(exec->inputStack);
3661  }
3662  if (exec->errString != NULL)
3663  xmlFree(exec->errString);
3664  xmlFree(exec);
3665 }
3666 
3667 static void
3668 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3669  void *data) {
3670 #ifdef DEBUG_PUSH
3671  printf("saving value: %d:%s\n", exec->inputStackNr, value);
3672 #endif
3673  if (exec->inputStackMax == 0) {
3674  exec->inputStackMax = 4;
3675  exec->inputStack = (xmlRegInputTokenPtr)
3676  xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3677  if (exec->inputStack == NULL) {
3678  xmlRegexpErrMemory(NULL, "pushing input string");
3679  exec->inputStackMax = 0;
3680  return;
3681  }
3682  } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3683  xmlRegInputTokenPtr tmp;
3684 
3685  exec->inputStackMax *= 2;
3686  tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3687  exec->inputStackMax * sizeof(xmlRegInputToken));
3688  if (tmp == NULL) {
3689  xmlRegexpErrMemory(NULL, "pushing input string");
3690  exec->inputStackMax /= 2;
3691  return;
3692  }
3693  exec->inputStack = tmp;
3694  }
3695  exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3696  exec->inputStack[exec->inputStackNr].data = data;
3697  exec->inputStackNr++;
3698  exec->inputStack[exec->inputStackNr].value = NULL;
3699  exec->inputStack[exec->inputStackNr].data = NULL;
3700 }
3701 
3715 static int
3716 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3717  if (expStr == valStr) return(1);
3718  if (expStr == NULL) return(0);
3719  if (valStr == NULL) return(0);
3720  do {
3721  /*
3722  * Eval if we have a wildcard for the current item.
3723  */
3724  if (*expStr != *valStr) {
3725  /* if one of them starts with a wildcard make valStr be it */
3726  if (*valStr == '*') {
3727  const xmlChar *tmp;
3728 
3729  tmp = valStr;
3730  valStr = expStr;
3731  expStr = tmp;
3732  }
3733  if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3734  do {
3735  if (*valStr == XML_REG_STRING_SEPARATOR)
3736  break;
3737  valStr++;
3738  } while (*valStr != 0);
3739  continue;
3740  } else
3741  return(0);
3742  }
3743  expStr++;
3744  valStr++;
3745  } while (*valStr != 0);
3746  if (*expStr != 0)
3747  return (0);
3748  else
3749  return (1);
3750 }
3751 
3764 static int
3765 xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3766  xmlRegexpPtr comp,
3767  const xmlChar *value,
3768  void *data) {
3769  int state = exec->index;
3770  int i, target;
3771 
3772  if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3773  return(-1);
3774 
3775  if (value == NULL) {
3776  /*
3777  * are we at a final state ?
3778  */
3779  if (comp->compact[state * (comp->nbstrings + 1)] ==
3780  XML_REGEXP_FINAL_STATE)
3781  return(1);
3782  return(0);
3783  }
3784 
3785 #ifdef DEBUG_PUSH
3786  printf("value pushed: %s\n", value);
3787 #endif
3788 
3789  /*
3790  * Examine all outside transitions from current state
3791  */
3792  for (i = 0;i < comp->nbstrings;i++) {
3793  target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3794  if ((target > 0) && (target <= comp->nbstates)) {
3795  target--; /* to avoid 0 */
3796  if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3797  exec->index = target;
3798  if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3799  exec->callback(exec->data, value,
3800  comp->transdata[state * comp->nbstrings + i], data);
3801  }
3802 #ifdef DEBUG_PUSH
3803  printf("entering state %d\n", target);
3804 #endif
3805  if (comp->compact[target * (comp->nbstrings + 1)] ==
3806  XML_REGEXP_SINK_STATE)
3807  goto error;
3808 
3809  if (comp->compact[target * (comp->nbstrings + 1)] ==
3810  XML_REGEXP_FINAL_STATE)
3811  return(1);
3812  return(0);
3813  }
3814  }
3815  }
3816  /*
3817  * Failed to find an exit transition out from current state for the
3818  * current token
3819  */
3820 #ifdef DEBUG_PUSH
3821  printf("failed to find a transition for %s on state %d\n", value, state);
3822 #endif
3823 error:
3824  if (exec->errString != NULL)
3825  xmlFree(exec->errString);
3826  exec->errString = xmlStrdup(value);
3827  exec->errStateNo = state;
3828  exec->status = -1;
3829 #ifdef DEBUG_ERR
3830  testerr(exec);
3831 #endif
3832  return(-1);
3833 }
3834 
3847 static int
3848 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3849  void *data, int compound) {
3850  xmlRegTransPtr trans;
3851  xmlRegAtomPtr atom;
3852  int ret;
3853  int final = 0;
3854  int progress = 1;
3855 
3856  if (exec == NULL)
3857  return(-1);
3858  if (exec->comp == NULL)
3859  return(-1);
3860  if (exec->status != 0)
3861  return(exec->status);
3862 
3863  if (exec->comp->compact != NULL)
3864  return(xmlRegCompactPushString(exec, exec->comp, value, data));
3865 
3866  if (value == NULL) {
3867  if (exec->state->type == XML_REGEXP_FINAL_STATE)
3868  return(1);
3869  final = 1;
3870  }
3871 
3872 #ifdef DEBUG_PUSH
3873  printf("value pushed: %s\n", value);
3874 #endif
3875  /*
3876  * If we have an active rollback stack push the new value there
3877  * and get back to where we were left
3878  */
3879  if ((value != NULL) && (exec->inputStackNr > 0)) {
3880  xmlFARegExecSaveInputString(exec, value, data);
3881  value = exec->inputStack[exec->index].value;
3882  data = exec->inputStack[exec->index].data;
3883 #ifdef DEBUG_PUSH
3884  printf("value loaded: %s\n", value);
3885 #endif
3886  }
3887 
3888  while ((exec->status == 0) &&
3889  ((value != NULL) ||
3890  ((final == 1) &&
3891  (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3892 
3893  /*
3894  * End of input on non-terminal state, rollback, however we may
3895  * still have epsilon like transition for counted transitions
3896  * on counters, in that case don't break too early.
3897  */
3898  if ((value == NULL) && (exec->counts == NULL))
3899  goto rollback;
3900 
3901  exec->transcount = 0;
3902  for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3903  trans = &exec->state->trans[exec->transno];
3904  if (trans->to < 0)
3905  continue;
3906  atom = trans->atom;
3907  ret = 0;
3908  if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3909  int i;
3910  int count;
3911  xmlRegTransPtr t;
3912  xmlRegCounterPtr counter;
3913 
3914  ret = 0;
3915 
3916 #ifdef DEBUG_PUSH
3917  printf("testing all lax %d\n", trans->count);
3918 #endif
3919  /*
3920  * Check all counted transitions from the current state
3921  */
3922  if ((value == NULL) && (final)) {
3923  ret = 1;
3924  } else if (value != NULL) {
3925  for (i = 0;i < exec->state->nbTrans;i++) {
3926  t = &exec->state->trans[i];
3927  if ((t->counter < 0) || (t == trans))
3928  continue;
3929  counter = &exec->comp->counters[t->counter];
3930  count = exec->counts[t->counter];
3931  if ((count < counter->max) &&
3932  (t->atom != NULL) &&
3933  (xmlStrEqual(value, t->atom->valuep))) {
3934  ret = 0;
3935  break;
3936  }
3937  if ((count >= counter->min) &&
3939  (t->atom != NULL) &&
3940  (xmlStrEqual(value, t->atom->valuep))) {
3941  ret = 1;
3942  break;
3943  }
3944  }
3945  }
3946  } else if (trans->count == REGEXP_ALL_COUNTER) {
3947  int i;
3948  int count;
3949  xmlRegTransPtr t;
3950  xmlRegCounterPtr counter;
3951 
3952  ret = 1;
3953 
3954 #ifdef DEBUG_PUSH
3955  printf("testing all %d\n", trans->count);
3956 #endif
3957  /*
3958  * Check all counted transitions from the current state
3959  */
3960  for (i = 0;i < exec->state->nbTrans;i++) {
3961  t = &exec->state->trans[i];
3962  if ((t->counter < 0) || (t == trans))
3963  continue;
3964  counter = &exec->comp->counters[t->counter];
3965  count = exec->counts[t->counter];
3966  if ((count < counter->min) || (count > counter->max)) {
3967  ret = 0;
3968  break;
3969  }
3970  }
3971  } else if (trans->count >= 0) {
3972  int count;
3973  xmlRegCounterPtr counter;
3974 
3975  /*
3976  * A counted transition.
3977  */
3978 
3979  count = exec->counts[trans->count];
3980  counter = &exec->comp->counters[trans->count];
3981 #ifdef DEBUG_PUSH
3982  printf("testing count %d: val %d, min %d, max %d\n",
3983  trans->count, count, counter->min, counter->max);
3984 #endif
3985  ret = ((count >= counter->min) && (count <= counter->max));
3986  } else if (atom == NULL) {
3987  fprintf(stderr, "epsilon transition left at runtime\n");
3988  exec->status = -2;
3989  break;
3990  } else if (value != NULL) {
3991  ret = xmlRegStrEqualWildcard(atom->valuep, value);
3992  if (atom->neg) {
3993  ret = !ret;
3994  if (!compound)
3995  ret = 0;
3996  }
3997  if ((ret == 1) && (trans->counter >= 0)) {
3998  xmlRegCounterPtr counter;
3999  int count;
4000 
4001  count = exec->counts[trans->counter];
4002  counter = &exec->comp->counters[trans->counter];
4003  if (count >= counter->max)
4004  ret = 0;
4005  }
4006 
4007  if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4008  xmlRegStatePtr to = exec->comp->states[trans->to];
4009 
4010  /*
4011  * this is a multiple input sequence
4012  */
4013  if (exec->state->nbTrans > exec->transno + 1) {
4014  if (exec->inputStackNr <= 0) {
4015  xmlFARegExecSaveInputString(exec, value, data);
4016  }
4017  xmlFARegExecSave(exec);
4018  }
4019  exec->transcount = 1;
4020  do {
4021  /*
4022  * Try to progress as much as possible on the input
4023  */
4024  if (exec->transcount == atom->max) {
4025  break;
4026  }
4027  exec->index++;
4028  value = exec->inputStack[exec->index].value;
4029  data = exec->inputStack[exec->index].data;
4030 #ifdef DEBUG_PUSH
4031  printf("value loaded: %s\n", value);
4032 #endif
4033 
4034  /*
4035  * End of input: stop here
4036  */
4037  if (value == NULL) {
4038  exec->index --;
4039  break;
4040  }
4041  if (exec->transcount >= atom->min) {
4042  int transno = exec->transno;
4043  xmlRegStatePtr state = exec->state;
4044 
4045  /*
4046  * The transition is acceptable save it
4047  */
4048  exec->transno = -1; /* trick */
4049  exec->state = to;
4050  if (exec->inputStackNr <= 0) {
4051  xmlFARegExecSaveInputString(exec, value, data);
4052  }
4053  xmlFARegExecSave(exec);
4054  exec->transno = transno;
4055  exec->state = state;
4056  }
4057  ret = xmlStrEqual(value, atom->valuep);
4058  exec->transcount++;
4059  } while (ret == 1);
4060  if (exec->transcount < atom->min)
4061  ret = 0;
4062 
4063  /*
4064  * If the last check failed but one transition was found
4065  * possible, rollback
4066  */
4067  if (ret < 0)
4068  ret = 0;
4069  if (ret == 0) {
4070  goto rollback;
4071  }
4072  }
4073  }
4074  if (ret == 1) {
4075  if ((exec->callback != NULL) && (atom != NULL) &&
4076  (data != NULL)) {
4077  exec->callback(exec->data, atom->valuep,
4078  atom->data, data);
4079  }
4080  if (exec->state->nbTrans > exec->transno + 1) {
4081  if (exec->inputStackNr <= 0) {
4082  xmlFARegExecSaveInputString(exec, value, data);
4083  }
4084  xmlFARegExecSave(exec);
4085  }
4086  if (trans->counter >= 0) {
4087 #ifdef DEBUG_PUSH
4088  printf("Increasing count %d\n", trans->counter);
4089 #endif
4090  exec->counts[trans->counter]++;
4091  }
4092  if ((trans->count >= 0) &&
4093  (trans->count < REGEXP_ALL_COUNTER)) {
4094 #ifdef DEBUG_REGEXP_EXEC
4095  printf("resetting count %d on transition\n",
4096  trans->count);
4097 #endif
4098  exec->counts[trans->count] = 0;
4099  }
4100 #ifdef DEBUG_PUSH
4101  printf("entering state %d\n", trans->to);
4102 #endif
4103  if ((exec->comp->states[trans->to] != NULL) &&
4104  (exec->comp->states[trans->to]->type ==
4105  XML_REGEXP_SINK_STATE)) {
4106  /*
4107  * entering a sink state, save the current state as error
4108  * state.
4109  */
4110  if (exec->errString != NULL)
4111  xmlFree(exec->errString);
4112  exec->errString = xmlStrdup(value);
4113  exec->errState = exec->state;
4114  memcpy(exec->errCounts, exec->counts,
4115  exec->comp->nbCounters * sizeof(int));
4116  }
4117  exec->state = exec->comp->states[trans->to];
4118  exec->transno = 0;
4119  if (trans->atom != NULL) {
4120  if (exec->inputStack != NULL) {
4121  exec->index++;
4122  if (exec->index < exec->inputStackNr) {
4123  value = exec->inputStack[exec->index].value;
4124  data = exec->inputStack[exec->index].data;
4125 #ifdef DEBUG_PUSH
4126  printf("value loaded: %s\n", value);
4127 #endif
4128  } else {
4129  value = NULL;
4130  data = NULL;
4131 #ifdef DEBUG_PUSH
4132  printf("end of input\n");
4133 #endif
4134  }
4135  } else {
4136  value = NULL;
4137  data = NULL;
4138 #ifdef DEBUG_PUSH
4139  printf("end of input\n");
4140 #endif
4141  }
4142  }
4143  goto progress;
4144  } else if (ret < 0) {
4145  exec->status = -4;
4146  break;
4147  }
4148  }
4149  if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4150 rollback:
4151  /*
4152  * if we didn't yet rollback on the current input
4153  * store the current state as the error state.
4154  */
4155  if ((progress) && (exec->state != NULL) &&
4156  (exec->state->type != XML_REGEXP_SINK_STATE)) {
4157  progress = 0;
4158  if (exec->errString != NULL)
4159  xmlFree(exec->errString);
4160  exec->errString = xmlStrdup(value);
4161  exec->errState = exec->state;
4162  if (exec->comp->nbCounters)
4163  memcpy(exec->errCounts, exec->counts,
4164  exec->comp->nbCounters * sizeof(int));
4165  }
4166 
4167  /*
4168  * Failed to find a way out
4169  */
4170  exec->determinist = 0;
4171  xmlFARegExecRollBack(exec);
4172  if ((exec->inputStack != NULL ) && (exec->status == 0)) {
4173  value = exec->inputStack[exec->index].value;
4174  data = exec->inputStack[exec->index].data;
4175 #ifdef DEBUG_PUSH
4176  printf("value loaded: %s\n", value);
4177 #endif
4178  }
4179  }
4180  continue;
4181 progress:
4182  progress = 1;
4183  continue;
4184  }
4185  if (exec->status == 0) {
4186  return(exec->state->type == XML_REGEXP_FINAL_STATE);
4187  }
4188 #ifdef DEBUG_ERR
4189  if (exec->status < 0) {
4190  testerr(exec);
4191  }
4192 #endif
4193  return(exec->status);
4194 }
4195 
4207 int
4208 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4209  void *data) {
4210  return(xmlRegExecPushStringInternal(exec, value, data, 0));
4211 }
4212 
4225 int
4226 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4227  const xmlChar *value2, void *data) {
4228  xmlChar buf[150];
4229  int lenn, lenp, ret;
4230  xmlChar *str;
4231 
4232  if (exec == NULL)
4233  return(-1);
4234  if (exec->comp == NULL)
4235  return(-1);
4236  if (exec->status != 0)
4237  return(exec->status);
4238 
4239  if (value2 == NULL)
4240  return(xmlRegExecPushString(exec, value, data));
4241 
4242  lenn = strlen((char *) value2);
4243  lenp = strlen((char *) value);
4244 
4245  if (150 < lenn + lenp + 2) {
4246  str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4247  if (str == NULL) {
4248  exec->status = -1;
4249  return(-1);
4250  }
4251  } else {
4252  str = buf;
4253  }
4254  memcpy(&str[0], value, lenp);
4255  str[lenp] = XML_REG_STRING_SEPARATOR;
4256  memcpy(&str[lenp + 1], value2, lenn);
4257  str[lenn + lenp + 1] = 0;
4258 
4259  if (exec->comp->compact != NULL)
4260  ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4261  else
4262  ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4263 
4264  if (str != buf)
4265  xmlFree(str);
4266  return(ret);
4267 }
4268 
4283 static int
4284 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4285  int *nbval, int *nbneg,
4286  xmlChar **values, int *terminal) {
4287  int maxval;
4288  int nb = 0;
4289 
4290  if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4291  (values == NULL) || (*nbval <= 0))
4292  return(-1);
4293 
4294  maxval = *nbval;
4295  *nbval = 0;
4296  *nbneg = 0;
4297  if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4298  xmlRegexpPtr comp;
4299  int target, i, state;
4300 
4301  comp = exec->comp;
4302 
4303  if (err) {
4304  if (exec->errStateNo == -1) return(-1);
4305  state = exec->errStateNo;
4306  } else {
4307  state = exec->index;
4308  }
4309  if (terminal != NULL) {
4310  if (comp->compact[state * (comp->nbstrings + 1)] ==
4311  XML_REGEXP_FINAL_STATE)
4312  *terminal = 1;
4313  else
4314  *terminal = 0;
4315  }
4316  for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4317  target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4318  if ((target > 0) && (target <= comp->nbstates) &&
4319  (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4320  XML_REGEXP_SINK_STATE)) {
4321  values[nb++] = comp->stringMap[i];
4322  (*nbval)++;
4323  }
4324  }
4325  for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4326  target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4327  if ((target > 0) && (target <= comp->nbstates) &&
4328  (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4329  XML_REGEXP_SINK_STATE)) {
4330  values[nb++] = comp->stringMap[i];
4331  (*nbneg)++;
4332  }
4333  }
4334  } else {
4335  int transno;
4336  xmlRegTransPtr trans;
4337  xmlRegAtomPtr atom;
4338  xmlRegStatePtr state;
4339 
4340  if (terminal != NULL) {
4341  if (exec->state->type == XML_REGEXP_FINAL_STATE)
4342  *terminal = 1;
4343  else
4344  *terminal = 0;
4345  }
4346 
4347  if (err) {
4348  if (exec->errState == NULL) return(-1);
4349  state = exec->errState;
4350  } else {
4351  if (exec->state == NULL) return(-1);
4352  state = exec->state;
4353  }
4354  for (transno = 0;
4355  (transno < state->nbTrans) && (nb < maxval);
4356  transno++) {
4357  trans = &state->trans[transno];
4358  if (trans->to < 0)
4359  continue;
4360  atom = trans->atom;
4361  if ((atom == NULL) || (atom->valuep == NULL))
4362  continue;
4363  if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4364  /* this should not be reached but ... */
4365  TODO;
4366  } else if (trans->count == REGEXP_ALL_COUNTER) {
4367  /* this should not be reached but ... */
4368  TODO;
4369  } else if (trans->counter >= 0) {
4370  xmlRegCounterPtr counter = NULL;
4371  int count;
4372 
4373  if (err)
4374  count = exec->errCounts[trans->counter];
4375  else
4376  count = exec->counts[trans->counter];
4377  if (exec->comp != NULL)
4378  counter = &exec->comp->counters[trans->counter];
4379  if ((counter == NULL) || (count < counter->max)) {
4380  if (atom->neg)
4381  values[nb++] = (xmlChar *) atom->valuep2;
4382  else
4383  values[nb++] = (xmlChar *) atom->valuep;
4384  (*nbval)++;
4385  }
4386  } else {
4387  if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4388  (exec->comp->states[trans->to]->type !=
4389  XML_REGEXP_SINK_STATE)) {
4390  if (atom->neg)
4391  values[nb++] = (xmlChar *) atom->valuep2;
4392  else
4393  values[nb++] = (xmlChar *) atom->valuep;
4394  (*nbval)++;
4395  }
4396  }
4397  }
4398  for (transno = 0;
4399  (transno < state->nbTrans) && (nb < maxval);
4400  transno++) {
4401  trans = &state->trans[transno];
4402  if (trans->to < 0)
4403  continue;
4404  atom = trans->atom;
4405  if ((atom == NULL) || (atom->valuep == NULL))
4406  continue;
4407  if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4408  continue;
4409  } else if (trans->count == REGEXP_ALL_COUNTER) {
4410  continue;
4411  } else if (trans->counter >= 0) {
4412  continue;
4413  } else {
4414  if ((exec->comp->states[trans->to] != NULL) &&
4415  (exec->comp->states[trans->to]->type ==
4416  XML_REGEXP_SINK_STATE)) {
4417  if (atom->neg)
4418  values[nb++] = (xmlChar *) atom->valuep2;
4419  else
4420  values[nb++] = (xmlChar *) atom->valuep;
4421  (*nbneg)++;
4422  }
4423  }
4424  }
4425  }
4426  return(0);
4427 }
4428 
4446 int
4447 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4448  xmlChar **values, int *terminal) {
4449  return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4450 }
4451 
4471 int
4472 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4473  int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4474  if (exec == NULL)
4475  return(-1);
4476  if (string != NULL) {
4477  if (exec->status != 0)
4478  *string = exec->errString;
4479  else
4480  *string = NULL;
4481  }
4482  return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4483 }
4484 
4485 #ifdef DEBUG_ERR
4486 static void testerr(xmlRegExecCtxtPtr exec) {
4487  const xmlChar *string;
4488  xmlChar *values[5];
4489  int nb = 5;
4490  int nbneg;
4491  int terminal;
4492  xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
4493 }
4494 #endif
4495 
4496 #if 0
4497 static int
4498 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4499  xmlRegTransPtr trans;
4500  xmlRegAtomPtr atom;
4501  int ret;
4502  int codepoint, len;
4503 
4504  if (exec == NULL)
4505  return(-1);
4506  if (exec->status != 0)
4507  return(exec->status);
4508 
4509  while ((exec->status == 0) &&
4510  ((exec->inputString[exec->index] != 0) ||
4511  (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4512 
4513  /*
4514  * End of input on non-terminal state, rollback, however we may
4515  * still have epsilon like transition for counted transitions
4516  * on counters, in that case don't break too early.
4517  */
4518  if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4519  goto rollback;
4520 
4521  exec->transcount = 0;
4522  for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4523  trans = &exec->state->trans[exec->transno];
4524  if (trans->to < 0)
4525  continue;
4526  atom = trans->atom;
4527  ret = 0;
4528  if (trans->count >= 0) {
4529  int count;
4530  xmlRegCounterPtr counter;
4531 
4532  /*
4533  * A counted transition.
4534  */
4535 
4536  count = exec->counts[trans->count];
4537  counter = &exec->comp->counters[trans->count];
4538 #ifdef DEBUG_REGEXP_EXEC
4539  printf("testing count %d: val %d, min %d, max %d\n",
4540  trans->count, count, counter->min, counter->max);
4541 #endif
4542  ret = ((count >= counter->min) && (count <= counter->max));
4543  } else if (atom == NULL) {
4544  fprintf(stderr, "epsilon transition left at runtime\n");
4545  exec->status = -2;
4546  break;
4547  } else if (exec->inputString[exec->index] != 0) {
4548  codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4549  ret = xmlRegCheckCharacter(atom, codepoint);
4550  if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4551  xmlRegStatePtr to = exec->comp->states[trans->to];
4552 
4553  /*
4554  * this is a multiple input sequence
4555  */
4556  if (exec->state->nbTrans > exec->transno + 1) {
4557  xmlFARegExecSave(exec);
4558  }
4559  exec->transcount = 1;
4560  do {
4561  /*
4562  * Try to progress as much as possible on the input
4563  */
4564  if (exec->transcount == atom->max) {
4565  break;
4566  }
4567  exec->index += len;
4568  /*
4569  * End of input: stop here
4570  */
4571  if (exec->inputString[exec->index] == 0) {
4572  exec->index -= len;
4573  break;
4574  }
4575  if (exec->transcount >= atom->min) {
4576  int transno = exec->transno;
4577  xmlRegStatePtr state = exec->state;
4578 
4579  /*
4580  * The transition is acceptable save it
4581  */
4582  exec->transno = -1; /* trick */
4583  exec->state = to;
4584  xmlFARegExecSave(exec);
4585  exec->transno = transno;
4586  exec->state = state;
4587  }
4588  codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4589  len);
4590  ret = xmlRegCheckCharacter(atom, codepoint);
4591  exec->transcount++;
4592  } while (ret == 1);
4593  if (exec->transcount < atom->min)
4594  ret = 0;
4595 
4596  /*
4597  * If the last check failed but one transition was found
4598  * possible, rollback
4599  */
4600  if (ret < 0)
4601  ret = 0;
4602  if (ret == 0) {
4603  goto rollback;
4604  }
4605  }
4606  }
4607  if (ret == 1) {
4608  if (exec->state->nbTrans > exec->transno + 1) {
4609  xmlFARegExecSave(exec);
4610  }
4611  /*
4612  * restart count for expressions like this ((abc){2})*
4613  */
4614  if (trans->count >= 0) {
4615 #ifdef DEBUG_REGEXP_EXEC
4616  printf("Reset count %d\n", trans->count);
4617 #endif
4618  exec->counts[trans->count] = 0;
4619  }
4620  if (trans->counter >= 0) {
4621 #ifdef DEBUG_REGEXP_EXEC
4622  printf("Increasing count %d\n", trans->counter);
4623 #endif
4624  exec->counts[trans->counter]++;
4625  }
4626 #ifdef DEBUG_REGEXP_EXEC
4627  printf("entering state %d\n", trans->to);
4628 #endif
4629  exec->state = exec->comp->states[trans->to];
4630  exec->transno = 0;
4631  if (trans->atom != NULL) {
4632  exec->index += len;
4633  }
4634  goto progress;
4635  } else if (ret < 0) {
4636  exec->status = -4;
4637  break;
4638  }
4639  }
4640  if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4641 rollback:
4642  /*
4643  * Failed to find a way out
4644  */
4645  exec->determinist = 0;
4646  xmlFARegExecRollBack(exec);
4647  }
4648 progress:
4649  continue;
4650  }
4651 }
4652 #endif
4653 /************************************************************************
4654  * *
4655  * Parser for the Schemas Datatype Regular Expressions *
4656  * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4657  * *
4658  ************************************************************************/
4659 
4666 static int
4667 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4668  int cur;
4669  int len;
4670 
4671  cur = CUR_SCHAR(ctxt->cur, len);
4672  if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4673  (cur == '*') || (cur == '+') || (cur == '(') ||
4674  (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4675  (cur == 0x5D) || (cur == 0))
4676  return(-1);
4677  return(cur);
4678 }
4679 
4696 static void
4697 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4698  int cur;
4699  xmlRegAtomType type = (xmlRegAtomType) 0;
4700  xmlChar *blockName = NULL;
4701 
4702  cur = CUR;
4703  if (cur == 'L') {
4704  NEXT;
4705  cur = CUR;
4706  if (cur == 'u') {
4707  NEXT;
4708  type = XML_REGEXP_LETTER_UPPERCASE;
4709  } else if (cur == 'l') {
4710  NEXT;
4711  type = XML_REGEXP_LETTER_LOWERCASE;
4712  } else if (cur == 't') {
4713  NEXT;
4714  type = XML_REGEXP_LETTER_TITLECASE;
4715  } else if (cur == 'm') {
4716  NEXT;
4717  type = XML_REGEXP_LETTER_MODIFIER;
4718  } else if (cur == 'o') {
4719  NEXT;
4720  type = XML_REGEXP_LETTER_OTHERS;
4721  } else {
4722  type = XML_REGEXP_LETTER;
4723  }
4724  } else if (cur == 'M') {
4725  NEXT;
4726  cur = CUR;
4727  if (cur == 'n') {
4728  NEXT;
4729  /* nonspacing */
4730  type = XML_REGEXP_MARK_NONSPACING;
4731  } else if (cur == 'c') {
4732  NEXT;
4733  /* spacing combining */
4734  type = XML_REGEXP_MARK_SPACECOMBINING;
4735  } else if (cur == 'e') {
4736  NEXT;
4737  /* enclosing */
4738  type = XML_REGEXP_MARK_ENCLOSING;
4739  } else {
4740  /* all marks */
4741  type = XML_REGEXP_MARK;
4742  }
4743  } else if (cur == 'N') {
4744  NEXT;
4745  cur = CUR;
4746  if (cur == 'd') {
4747  NEXT;
4748  /* digital */
4749  type = XML_REGEXP_NUMBER_DECIMAL;
4750  } else if (cur == 'l') {
4751  NEXT;
4752  /* letter */
4753  type = XML_REGEXP_NUMBER_LETTER;
4754  } else if (cur == 'o') {
4755  NEXT;
4756  /* other */
4757  type = XML_REGEXP_NUMBER_OTHERS;
4758  } else {
4759  /* all numbers */
4760  type = XML_REGEXP_NUMBER;
4761  }
4762  } else if (cur == 'P') {
4763  NEXT;
4764  cur = CUR;
4765  if (cur == 'c') {
4766  NEXT;
4767  /* connector */
4768  type = XML_REGEXP_PUNCT_CONNECTOR;
4769  } else if (cur == 'd') {
4770  NEXT;
4771  /* dash */
4772  type = XML_REGEXP_PUNCT_DASH;
4773  } else if (cur == 's') {
4774  NEXT;
4775  /* open */
4776  type = XML_REGEXP_PUNCT_OPEN;
4777  } else if (cur == 'e') {
4778  NEXT;
4779  /* close */
4780  type = XML_REGEXP_PUNCT_CLOSE;
4781  } else if (cur == 'i') {
4782  NEXT;
4783  /* initial quote */
4784  type = XML_REGEXP_PUNCT_INITQUOTE;
4785  } else if (cur == 'f') {
4786  NEXT;
4787  /* final quote */
4788  type = XML_REGEXP_PUNCT_FINQUOTE;
4789  } else if (cur == 'o') {
4790  NEXT;
4791  /* other */
4792  type = XML_REGEXP_PUNCT_OTHERS;
4793  } else {
4794  /* all punctuation */
4795  type = XML_REGEXP_PUNCT;
4796  }
4797  } else if (cur == 'Z') {
4798  NEXT;
4799  cur = CUR;
4800  if (cur == 's') {
4801  NEXT;
4802  /* space */
4803  type = XML_REGEXP_SEPAR_SPACE;
4804  } else if (cur == 'l') {
4805  NEXT;
4806  /* line */
4807  type = XML_REGEXP_SEPAR_LINE;
4808  } else if (cur == 'p') {
4809  NEXT;
4810  /* paragraph */
4811  type = XML_REGEXP_SEPAR_PARA;
4812  } else {
4813  /* all separators */
4814  type = XML_REGEXP_SEPAR;
4815  }
4816  } else if (cur == 'S') {
4817  NEXT;
4818  cur = CUR;
4819  if (cur == 'm') {
4820  NEXT;
4821  type = XML_REGEXP_SYMBOL_MATH;
4822  /* math */
4823  } else if (cur == 'c') {
4824  NEXT;
4825  type = XML_REGEXP_SYMBOL_CURRENCY;
4826  /* currency */
4827  } else if (cur == 'k') {
4828  NEXT;
4829  type = XML_REGEXP_SYMBOL_MODIFIER;
4830  /* modifiers */
4831  } else if (cur == 'o') {
4832  NEXT;
4833  type = XML_REGEXP_SYMBOL_OTHERS;
4834  /* other */
4835  } else {
4836  /* all symbols */
4837  type = XML_REGEXP_SYMBOL;
4838  }
4839  } else if (cur == 'C') {
4840  NEXT;
4841  cur = CUR;
4842  if (cur == 'c') {
4843  NEXT;
4844  /* control */
4845  type = XML_REGEXP_OTHER_CONTROL;
4846  } else if (cur == 'f') {
4847  NEXT;
4848  /* format */
4849  type = XML_REGEXP_OTHER_FORMAT;
4850  } else if (cur == 'o') {
4851  NEXT;
4852  /* private use */
4853  type = XML_REGEXP_OTHER_PRIVATE;
4854  } else if (cur == 'n') {
4855  NEXT;
4856  /* not assigned */
4857  type = XML_REGEXP_OTHER_NA;
4858  } else {
4859  /* all others */
4860  type = XML_REGEXP_OTHER;
4861  }
4862  } else if (cur == 'I') {
4863  const xmlChar *start;
4864  NEXT;
4865  cur = CUR;
4866  if (cur != 's') {
4867  ERROR("IsXXXX expected");
4868  return;
4869  }
4870  NEXT;
4871  start = ctxt->cur;
4872  cur = CUR;
4873  if (((cur >= 'a') && (cur <= 'z')) ||
4874  ((cur >= 'A') && (cur <= 'Z')) ||
4875  ((cur >= '0') && (cur <= '9')) ||
4876  (cur == 0x2D)) {
4877  NEXT;
4878  cur = CUR;
4879  while (((cur >= 'a') && (cur <= 'z')) ||
4880  ((cur >= 'A') && (cur <= 'Z')) ||
4881  ((cur >= '0') && (cur <= '9')) ||
4882  (cur == 0x2D)) {
4883  NEXT;
4884  cur = CUR;
4885  }
4886  }
4887  type = XML_REGEXP_BLOCK_NAME;
4888  blockName = xmlStrndup(start, ctxt->cur - start);
4889  } else {
4890  ERROR("Unknown char property");
4891  return;
4892  }
4893  if (ctxt->atom == NULL) {
4894  ctxt->atom = xmlRegNewAtom(ctxt, type);
4895  if (ctxt->atom != NULL)
4896  ctxt->atom->valuep = blockName;
4897  } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4898  xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4899  type, 0, 0, blockName);
4900  }
4901 }
4902 
4913 static void
4914 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4915  int cur;
4916 
4917  if (CUR == '.') {
4918  if (ctxt->atom == NULL) {
4919  ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4920  } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4921  xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4922  XML_REGEXP_ANYCHAR, 0, 0, NULL);
4923  }
4924  NEXT;
4925  return;
4926  }
4927  if (CUR != '\\') {
4928  ERROR("Escaped sequence: expecting \\");
4929  return;
4930  }
4931  NEXT;
4932  cur = CUR;
4933  if (cur == 'p') {
4934  NEXT;
4935  if (CUR != '{') {
4936  ERROR("Expecting '{'");
4937  return;
4938  }
4939  NEXT;
4940  xmlFAParseCharProp(ctxt);
4941  if (CUR != '}') {
4942  ERROR("Expecting '}'");
4943  return;
4944  }
4945  NEXT;
4946  } else if (cur == 'P') {
4947  NEXT;
4948  if (CUR != '{') {
4949  ERROR("Expecting '{'");
4950  return;
4951  }
4952  NEXT;
4953  xmlFAParseCharProp(ctxt);
4954  if (ctxt->atom != NULL)
4955  ctxt->atom->neg = 1;
4956  if (CUR != '}') {
4957  ERROR("Expecting '}'");
4958  return;
4959  }
4960  NEXT;
4961  } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4962  (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4963  (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4964  (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4965  (cur == 0x5E)) {
4966  if (ctxt->atom == NULL) {
4967  ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4968  if (ctxt->atom != NULL) {
4969  switch (cur) {
4970  case 'n':
4971  ctxt->atom->codepoint = '\n';
4972  break;
4973  case 'r':
4974  ctxt->atom->codepoint = '\r';
4975  break;
4976  case 't':
4977  ctxt->atom->codepoint = '\t';
4978  break;
4979  default:
4980  ctxt->atom->codepoint = cur;
4981  }
4982  }
4983  } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4984  switch (cur) {
4985  case 'n':
4986  cur = '\n';
4987  break;
4988  case 'r':
4989  cur = '\r';
4990  break;
4991  case 't':
4992  cur = '\t';
4993  break;
4994  }
4995  xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4996  XML_REGEXP_CHARVAL, cur, cur, NULL);
4997  }
4998  NEXT;
4999  } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
5000  (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
5001  (cur == 'w') || (cur == 'W')) {
5002  xmlRegAtomType type = XML_REGEXP_ANYSPACE;
5003 
5004  switch (cur) {
5005  case 's':
5006  type = XML_REGEXP_ANYSPACE;
5007  break;
5008  case 'S':
5009  type = XML_REGEXP_NOTSPACE;
5010  break;
5011  case 'i':
5012  type = XML_REGEXP_INITNAME;
5013  break;
5014  case 'I':
5015  type = XML_REGEXP_NOTINITNAME;
5016  break;
5017  case 'c':
5018  type = XML_REGEXP_NAMECHAR;
5019  break;
5020  case 'C':
5021  type = XML_REGEXP_NOTNAMECHAR;
5022  break;
5023  case 'd':
5024  type = XML_REGEXP_DECIMAL;
5025  break;
5026  case 'D':
5027  type = XML_REGEXP_NOTDECIMAL;
5028  break;
5029  case 'w':
5030  type = XML_REGEXP_REALCHAR;
5031  break;
5032  case 'W':
5033  type = XML_REGEXP_NOTREALCHAR;
5034  break;
5035  }
5036  NEXT;
5037  if (ctxt->atom == NULL) {
5038  ctxt->atom = xmlRegNewAtom(ctxt, type);
5039  } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5040  xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5041  type, 0, 0, NULL);
5042  }
5043  } else {
5044  ERROR("Wrong escape sequence, misuse of character '\\'");
5045  }
5046 }
5047 
5058 static void
5059 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
5060  int cur, len;
5061  int start = -1;
5062  int end = -1;
5063 
5064  if (CUR == '\0') {
5065  ERROR("Expecting ']'");
5066  return;
5067  }
5068 
5069  cur = CUR;
5070  if (cur == '\\') {
5071  NEXT;
5072  cur = CUR;
5073  switch (cur) {
5074  case 'n': start = 0xA; break;
5075  case 'r': start = 0xD; break;
5076  case 't': start = 0x9; break;
5077  case '\\': case '|': case '.': case '-': case '^': case '?':
5078  case '*': case '+': case '{': case '}': case '(': case ')':
5079  case '[': case ']':
5080  start = cur; break;
5081  default:
5082  ERROR("Invalid escape value");
5083  return;
5084  }
5085  end = start;
5086  len = 1;
5087  } else if ((cur != 0x5B) && (cur != 0x5D)) {
5088  end = start = CUR_SCHAR(ctxt->cur, len);
5089  } else {
5090  ERROR("Expecting a char range");
5091  return;
5092  }
5093  /*
5094  * Since we are "inside" a range, we can assume ctxt->cur is past
5095  * the start of ctxt->string, and PREV should be safe
5096  */
5097  if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5098  NEXTL(len);
5099  return;
5100  }
5101  NEXTL(len);
5102  cur = CUR;
5103  if ((cur != '-') || (NXT(1) == ']')) {
5104  xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5105  XML_REGEXP_CHARVAL, start, end, NULL);
5106  return;
5107  }
5108  NEXT;
5109  cur = CUR;
5110  if (cur == '\\') {
5111  NEXT;
5112  cur = CUR;
5113  switch (cur) {
5114  case 'n': end = 0xA; break;
5115  case 'r': end = 0xD; break;
5116  case 't': end = 0x9; break;
5117  case '\\': case '|': case '.': case '-': case '^': case '?':
5118  case '*': case '+': case '{': case '}': case '(': case ')':
5119  case '[': case ']':
5120  end = cur; break;
5121  default:
5122  ERROR("Invalid escape value");
5123  return;
5124  }
5125  len = 1;
5126  } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
5127  end = CUR_SCHAR(ctxt->cur, len);
5128  } else {
5129  ERROR("Expecting the end of a char range");
5130  return;
5131  }
5132 
5133  /* TODO check that the values are acceptable character ranges for XML */
5134  if (end < start) {
5135  ERROR("End of range is before start of range");
5136  } else {
5137  NEXTL(len);
5138  xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5139  XML_REGEXP_CHARVAL, start, end, NULL);
5140  }
5141  return;
5142 }
5143 
5150 static void
5151 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5152  do {
5153  if (CUR == '\\') {
5154  xmlFAParseCharClassEsc(ctxt);
5155  } else {
5156  xmlFAParseCharRange(ctxt);
5157  }
5158  } while ((CUR != ']') && (CUR != '-') &&
5159  (CUR != 0) && (ctxt->error == 0));
5160 }
5161 
5171 static void
5172 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5173  int neg = ctxt->neg;
5174 
5175  if (CUR == '^') {
5176  NEXT;
5177  ctxt->neg = !ctxt->neg;
5178  xmlFAParsePosCharGroup(ctxt);
5179  ctxt->neg = neg;
5180  }
5181  while ((CUR != ']') && (ctxt->error == 0)) {
5182  if ((CUR == '-') && (NXT(1) == '[')) {
5183  NEXT; /* eat the '-' */
5184  NEXT; /* eat the '[' */
5185  ctxt->neg = 2;
5186  xmlFAParseCharGroup(ctxt);
5187  ctxt->neg = neg;
5188  if (CUR == ']') {
5189  NEXT;
5190  } else {
5191  ERROR("charClassExpr: ']' expected");
5192  }
5193  break;
5194  } else {
5195  xmlFAParsePosCharGroup(ctxt);
5196  }
5197  }
5198 }
5199 
5207 static void
5208 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5209  if (CUR == '[') {
5210  NEXT;
5211  ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5212  if (ctxt->atom == NULL)
5213  return;
5214  xmlFAParseCharGroup(ctxt);
5215  if (CUR == ']') {
5216  NEXT;
5217  } else {
5218  ERROR("xmlFAParseCharClass: ']' expected");
5219  }
5220  } else {
5221  xmlFAParseCharClassEsc(ctxt);
5222  }
5223 }
5224 
5233 static int
5234 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5235  int ret = 0;
5236  int ok = 0;
5237  int overflow = 0;
5238 
5239  while ((CUR >= '0') && (CUR <= '9')) {
5240  if (ret > INT_MAX / 10) {
5241  overflow = 1;
5242  } else {
5243  int digit = CUR - '0';
5244 
5245  ret *= 10;
5246  if (ret > INT_MAX - digit)
5247  overflow = 1;
5248  else
5249  ret += digit;
5250  }
5251  ok = 1;
5252  NEXT;
5253  }
5254  if ((ok != 1) || (overflow == 1)) {
5255  return(-1);
5256  }
5257  return(ret);
5258 }
5259 
5270 static int
5271 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5272  int cur;
5273 
5274  cur = CUR;
5275  if ((cur == '?') || (cur == '*') || (cur == '+')) {
5276  if (ctxt->atom != NULL) {
5277  if (cur == '?')
5278  ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5279  else if (cur == '*')
5280  ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5281  else if (cur == '+')
5282  ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5283  }
5284  NEXT;
5285  return(1);
5286  }
5287  if (cur == '{') {
5288  int min = 0, max = 0;
5289 
5290  NEXT;
5291  cur = xmlFAParseQuantExact(ctxt);
5292  if (cur >= 0)
5293  min = cur;
5294  else {
5295  ERROR("Improper quantifier");
5296  }
5297  if (CUR == ',') {
5298  NEXT;
5299  if (CUR == '}')
5300  max = INT_MAX;
5301  else {
5302  cur = xmlFAParseQuantExact(ctxt);
5303  if (cur >= 0)
5304  max = cur;
5305  else {
5306  ERROR("Improper quantifier");
5307  }
5308  }
5309  }
5310  if (CUR == '}') {
5311  NEXT;
5312  } else {
5313  ERROR("Unterminated quantifier");
5314  }
5315  if (max == 0)
5316  max = min;
5317  if (ctxt->atom != NULL) {
5318  ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5319  ctxt->atom->min = min;
5320  ctxt->atom->max = max;
5321  }
5322  return(1);
5323  }
5324  return(0);
5325 }
5326 
5333 static int
5334 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5335  int codepoint, len;
5336 
5337  codepoint = xmlFAIsChar(ctxt);
5338  if (codepoint > 0) {
5339  ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5340  if (ctxt->atom == NULL)
5341  return(-1);
5342  codepoint = CUR_SCHAR(ctxt->cur, len);
5343  ctxt->atom->codepoint = codepoint;
5344  NEXTL(len);
5345  return(1);
5346  } else if (CUR == '|') {
5347  return(0);
5348  } else if (CUR == 0) {
5349  return(0);
5350  } else if (CUR == ')') {
5351  return(0);
5352  } else if (CUR == '(') {
5353  xmlRegStatePtr start, oldend, start0;
5354 
5355  NEXT;
5356  if (ctxt->depth >= 50) {
5357  ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5358  return(-1);
5359  }
5360  /*
5361  * this extra Epsilon transition is needed if we count with 0 allowed
5362  * unfortunately this can't be known at that point
5363  */
5364  xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5365  start0 = ctxt->state;
5366  xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5367  start = ctxt->state;
5368  oldend = ctxt->end;
5369  ctxt->end = NULL;
5370  ctxt->atom = NULL;
5371  ctxt->depth++;
5372  xmlFAParseRegExp(ctxt, 0);
5373  ctxt->depth--;
5374  if (CUR == ')') {
5375  NEXT;
5376  } else {
5377  ERROR("xmlFAParseAtom: expecting ')'");
5378  }
5379  ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5380  if (ctxt->atom == NULL)
5381  return(-1);
5382  ctxt->atom->start = start;
5383  ctxt->atom->start0 = start0;
5384  ctxt->atom->stop = ctxt->state;
5385  ctxt->end = oldend;
5386  return(1);
5387  } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5388  xmlFAParseCharClass(ctxt);
5389  return(1);
5390  }
5391  return(0);
5392 }
5393 
5400 static int
5401 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5402  int ret;
5403 
5404  ctxt->atom = NULL;
5405  ret = xmlFAParseAtom(ctxt);
5406  if (ret == 0)
5407  return(0);
5408  if (ctxt->atom == NULL) {
5409  ERROR("internal: no atom generated");
5410  }
5411  xmlFAParseQuantifier(ctxt);
5412  return(1);
5413 }
5414 
5425 static int
5426 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5427  xmlRegStatePtr previous;
5428  int ret;
5429 
5430  previous = ctxt->state;
5431  ret = xmlFAParsePiece(ctxt);
5432  if (ret == 0) {
5433  /* Empty branch */
5434  xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5435  } else {
5436  if (xmlFAGenerateTransitions(ctxt, previous,
5437  (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0)
5438  return(-1);
5439  previous = ctxt->state;
5440  ctxt->atom = NULL;
5441  }
5442  while ((ret != 0) && (ctxt->error == 0)) {
5443  ret = xmlFAParsePiece(ctxt);
5444  if (ret != 0) {
5445  if (xmlFAGenerateTransitions(ctxt, previous,
5446  (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5447  ctxt->atom) < 0)
5448  return(-1);
5449  previous = ctxt->state;
5450  ctxt->atom = NULL;
5451  }
5452  }
5453  return(0);
5454 }
5455 
5463 static void
5464 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5465  xmlRegStatePtr start, end;
5466 
5467  /* if not top start should have been generated by an epsilon trans */
5468  start = ctxt->state;
5469  ctxt->end = NULL;
5470  xmlFAParseBranch(ctxt, NULL);
5471  if (top) {
5472 #ifdef DEBUG_REGEXP_GRAPH
5473  printf("State %d is final\n", ctxt->state->no);
5474 #endif
5475  ctxt->state->type = XML_REGEXP_FINAL_STATE;
5476  }
5477  if (CUR != '|') {
5478  ctxt->end = ctxt->state;
5479  return;
5480  }
5481  end = ctxt->state;
5482  while ((CUR == '|') && (ctxt->error == 0)) {
5483  NEXT;
5484  ctxt->state = start;
5485  ctxt->end = NULL;
5486  xmlFAParseBranch(ctxt, end);
5487  }
5488  if (!top) {
5489  ctxt->state = end;
5490  ctxt->end = end;
5491  }
5492 }
5493 
5494 /************************************************************************
5495  * *
5496  * The basic API *
5497  * *
5498  ************************************************************************/
5499 
5507 void
5508 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5509  int i;
5510 
5511  if (output == NULL)
5512  return;
5513  fprintf(output, " regexp: ");
5514  if (regexp == NULL) {
5515  fprintf(output, "NULL\n");
5516  return;
5517  }
5518  fprintf(output, "'%s' ", regexp->string);
5519  fprintf(output, "\n");
5520  fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5521  for (i = 0;i < regexp->nbAtoms; i++) {
5522  fprintf(output, " %02d ", i);
5523  xmlRegPrintAtom(output, regexp->atoms[i]);
5524  }
5525  fprintf(output, "%d states:", regexp->nbStates);
5526  fprintf(output, "\n");
5527  for (i = 0;i < regexp->nbStates; i++) {
5528  xmlRegPrintState(output, regexp->states[i]);
5529  }
5530  fprintf(output, "%d counters:\n", regexp->nbCounters);
5531  for (i = 0;i < regexp->nbCounters; i++) {
5532  fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5533  regexp->counters[i].max);
5534  }
5535 }
5536 
5547 xmlRegexpPtr
5548 xmlRegexpCompile(const xmlChar *regexp) {
5549  xmlRegexpPtr ret;
5550  xmlRegParserCtxtPtr ctxt;
5551 
5552  ctxt = xmlRegNewParserCtxt(regexp);
5553  if (ctxt == NULL)
5554  return(NULL);
5555 
5556  /* initialize the parser */
5557  ctxt->end = NULL;
5558  ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5559  xmlRegStatePush(ctxt, ctxt->start);
5560 
5561  /* parse the expression building an automata */
5562  xmlFAParseRegExp(ctxt, 1);
5563  if (CUR != 0) {
5564  ERROR("xmlFAParseRegExp: extra characters");
5565  }
5566  if (ctxt->error != 0) {
5567  xmlRegFreeParserCtxt(ctxt);
5568  return(NULL);
5569  }
5570  ctxt->end = ctxt->state;
5571  ctxt->start->type = XML_REGEXP_START_STATE;
5572  ctxt->end->type = XML_REGEXP_FINAL_STATE;
5573 
5574  /* remove the Epsilon except for counted transitions */
5575  xmlFAEliminateEpsilonTransitions(ctxt);
5576 
5577 
5578  if (ctxt->error != 0) {
5579  xmlRegFreeParserCtxt(ctxt);
5580  return(NULL);
5581  }
5582  ret = xmlRegEpxFromParse(ctxt);
5583  xmlRegFreeParserCtxt(ctxt);
5584  return(ret);
5585 }
5586 
5596 int
5597 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5598  if ((comp == NULL) || (content == NULL))
5599  return(-1);
5600  return(xmlFARegExec(comp, content));
5601 }
5602 
5611 int
5612 xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5613  xmlAutomataPtr am;
5614  int ret;
5615 
5616  if (comp == NULL)
5617  return(-1);
5618  if (comp->determinist != -1)
5619  return(comp->determinist);
5620 
5621  am = xmlNewAutomata();
5622  if (am == NULL)
5623  return(-1);
5624  if (am->states != NULL) {
5625  int i;
5626 
5627  for (i = 0;i < am->nbStates;i++)
5628  xmlRegFreeState(am->states[i]);
5629  xmlFree(am->states);
5630  }
5631  am->nbAtoms = comp->nbAtoms;
5632  am->atoms = comp->atoms;
5633  am->nbStates = comp->nbStates;
5634  am->states = comp->states;
5635  am->determinist = -1;
5636  am->flags = comp->flags;
5637  ret = xmlFAComputesDeterminism(am);
5638  am->atoms = NULL;
5639  am->states = NULL;
5640  xmlFreeAutomata(am);
5641  comp->determinist = ret;
5642  return(ret);
5643 }
5644 
5651 void
5652 xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5653  int i;
5654  if (regexp == NULL)
5655  return;
5656 
5657  if (regexp->string != NULL)
5658  xmlFree(regexp->string);
5659  if (regexp->states != NULL) {
5660  for (i = 0;i < regexp->nbStates;i++)
5661  xmlRegFreeState(regexp->states[i]);
5662  xmlFree(regexp->states);
5663  }
5664  if (regexp->atoms != NULL) {
5665  for (i = 0;i < regexp->nbAtoms;i++)
5666  xmlRegFreeAtom(regexp->atoms[i]);
5667  xmlFree(regexp->atoms);
5668  }
5669  if (regexp->counters != NULL)
5670  xmlFree(regexp->counters);
5671  if (regexp->compact != NULL)
5672  xmlFree(regexp->compact);
5673  if (regexp->transdata != NULL)
5674  xmlFree(regexp->transdata);
5675  if (regexp->stringMap != NULL) {
5676  for (i = 0; i < regexp->nbstrings;i++)
5677  xmlFree(regexp->stringMap[i]);
5678  xmlFree(regexp->stringMap);
5679  }
5680 
5681  xmlFree(regexp);
5682 }
5683 
5684 #ifdef LIBXML_AUTOMATA_ENABLED
5685 /************************************************************************
5686  * *
5687  * The Automata interface *
5688  * *
5689  ************************************************************************/
5690 
5698 xmlAutomataPtr
5699 xmlNewAutomata(void) {
5700  xmlAutomataPtr ctxt;
5701 
5702  ctxt = xmlRegNewParserCtxt(NULL);
5703  if (ctxt == NULL)
5704  return(NULL);
5705 
5706  /* initialize the parser */
5707  ctxt->end = NULL;
5708  ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5709  if (ctxt->start == NULL) {
5710  xmlFreeAutomata(ctxt);
5711  return(NULL);
5712  }
5713  ctxt->start->type = XML_REGEXP_START_STATE;
5714  if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5715  xmlRegFreeState(ctxt->start);
5716  xmlFreeAutomata(ctxt);
5717  return(NULL);
5718  }
5719  ctxt->flags = 0;
5720 
5721  return(ctxt);
5722 }
5723 
5730 void
5731 xmlFreeAutomata(xmlAutomataPtr am) {
5732  if (am == NULL)
5733  return;
5734  xmlRegFreeParserCtxt(am);
5735 }
5736 
5744 void
5745 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5746  if (am == NULL)
5747  return;
5748  am->flags |= flags;
5749 }
5750 
5759 xmlAutomataStatePtr
5760 xmlAutomataGetInitState(xmlAutomataPtr am) {
5761  if (am == NULL)
5762  return(NULL);
5763  return(am->start);
5764 }
5765 
5775 int
5776 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5777  if ((am == NULL) || (state == NULL))
5778  return(-1);
5779  state->type = XML_REGEXP_FINAL_STATE;
5780  return(0);
5781 }
5782 
5797 xmlAutomataStatePtr
5798 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5799  xmlAutomataStatePtr to, const xmlChar *token,
5800  void *data) {
5801  xmlRegAtomPtr atom;
5802 
5803  if ((am == NULL) || (from == NULL) || (token == NULL))
5804  return(NULL);
5805  atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5806  if (atom == NULL)
5807  return(NULL);
5808  atom->data = data;
5809  atom->valuep = xmlStrdup(token);
5810 
5811  if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5812  xmlRegFreeAtom(atom);
5813  return(NULL);
5814  }
5815  if (to == NULL)
5816  return(am->state);
5817  return(to);
5818 }
5819 
5835 xmlAutomataStatePtr
5836 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5837  xmlAutomataStatePtr to, const xmlChar *token,
5838  const xmlChar *token2, void *data) {
5839  xmlRegAtomPtr atom;
5840 
5841  if ((am == NULL) || (from == NULL) || (token == NULL))
5842  return(NULL);
5843  atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5844  if (atom == NULL)
5845  return(NULL);
5846  atom->data = data;
5847  if ((token2 == NULL) || (*token2 == 0)) {
5848  atom->valuep = xmlStrdup(token);
5849  } else {
5850  int lenn, lenp;
5851  xmlChar *str;
5852 
5853  lenn = strlen((char *) token2);
5854  lenp = strlen((char *) token);
5855 
5856  str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5857  if (str == NULL) {
5858  xmlRegFreeAtom(atom);
5859  return(NULL);
5860  }
5861  memcpy(&str[0], token, lenp);
5862  str[lenp] = '|';
5863  memcpy(&str[lenp + 1], token2, lenn);
5864  str[lenn + lenp + 1] = 0;
5865 
5866  atom->valuep = str;
5867  }
5868 
5869  if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5870  xmlRegFreeAtom(atom);
5871  return(NULL);
5872  }
5873  if (to == NULL)
5874  return(am->state);
5875  return(to);
5876 }
5877 
5895 xmlAutomataStatePtr
5896 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5897  xmlAutomataStatePtr to, const xmlChar *token,
5898  const xmlChar *token2, void *data) {
5899  xmlRegAtomPtr atom;
5900  xmlChar err_msg[200];
5901 
5902  if ((am == NULL) || (from == NULL) || (token == NULL))
5903  return(NULL);
5904  atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5905  if (atom == NULL)
5906  return(NULL);
5907  atom->data = data;
5908  atom->neg = 1;
5909  if ((token2 == NULL) || (*token2 == 0)) {
5910  atom->valuep = xmlStrdup(token);
5911  } else {
5912  int lenn, lenp;
5913  xmlChar *str;
5914 
5915  lenn = strlen((char *) token2);
5916  lenp = strlen((char *) token);
5917 
5918  str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5919  if (str == NULL) {
5920  xmlRegFreeAtom(atom);
5921  return(NULL);
5922  }
5923  memcpy(&str[0], token, lenp);
5924  str[lenp] = '|';
5925  memcpy(&str[lenp + 1], token2, lenn);
5926  str[lenn + lenp + 1] = 0;
5927 
5928  atom->valuep = str;
5929  }
5930  snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5931  err_msg[199] = 0;
5932  atom->valuep2 = xmlStrdup(err_msg);
5933 
5934  if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5935  xmlRegFreeAtom(atom);
5936  return(NULL);
5937  }
5938  am->negs++;
5939  if (to == NULL)
5940  return(am->state);
5941  return(to);
5942 }
5943 
5962 xmlAutomataStatePtr
5963 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5964  xmlAutomataStatePtr to, const xmlChar *token,
5965  const xmlChar *token2,
5966  int min, int max, void *data) {
5967  xmlRegAtomPtr atom;
5968  int counter;
5969 
5970  if ((am == NULL) || (from == NULL) || (token == NULL))
5971  return(NULL);
5972  if (min < 0)
5973  return(NULL);
5974  if ((max < min) || (max < 1))
5975  return(NULL);
5976  atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5977  if (atom == NULL)
5978  return(NULL);
5979  if ((token2 == NULL) || (*token2 == 0)) {
5980  atom->valuep = xmlStrdup(token);
5981  } else {
5982  int lenn, lenp;
5983  xmlChar *str;
5984 
5985  lenn = strlen((char *) token2);
5986  lenp = strlen((char *) token);
5987 
5988  str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5989  if (str == NULL) {
5990  xmlRegFreeAtom(atom);
5991  return(NULL);
5992  }
5993  memcpy(&str[0], token, lenp);
5994  str[lenp] = '|';
5995  memcpy(&str[lenp + 1], token2, lenn);
5996  str[lenn + lenp + 1] = 0;
5997 
5998  atom->valuep = str;
5999  }
6000  atom->data = data;
6001  if (min == 0)
6002  atom->min = 1;
6003  else
6004  atom->min = min;
6005  atom->max = max;
6006 
6007  /*
6008  * associate a counter to the transition.
6009  */
6010  counter = xmlRegGetCounter(am);
6011  am->counters[counter].min = min;
6012  am->counters[counter].max = max;
6013 
6014  /* xmlFAGenerateTransitions(am, from, to, atom); */
6015  if (to == NULL) {
6016  to = xmlRegNewState(am);
6017  xmlRegStatePush(am, to);
6018  }
6019  xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6020  xmlRegAtomPush(am, atom);
6021  am->state = to;
6022 
6023  if (to == NULL)
6024  to = am->state;
6025  if (to == NULL)
6026  return(NULL);
6027  if (min == 0)
6028  xmlFAGenerateEpsilonTransition(am, from, to);
6029  return(to);
6030 }
6031 
6049 xmlAutomataStatePtr
6050 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6051  xmlAutomataStatePtr to, const xmlChar *token,
6052  int min, int max, void *data) {
6053  xmlRegAtomPtr atom;
6054  int counter;
6055 
6056  if ((am == NULL) || (from == NULL) || (token == NULL))
6057  return(NULL);
6058  if (min < 0)
6059  return(NULL);
6060  if ((max < min) || (max < 1))
6061  return(NULL);
6062  atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6063  if (atom == NULL)
6064  return(NULL);
6065  atom->valuep = xmlStrdup(token);
6066  atom->data = data;
6067  if (min == 0)
6068  atom->min = 1;
6069  else
6070  atom->min = min;
6071  atom->max = max;
6072 
6073  /*
6074  * associate a counter to the transition.
6075  */
6076  counter = xmlRegGetCounter(am);
6077  am->counters[counter].min = min;
6078  am->counters[counter].max = max;
6079 
6080  /* xmlFAGenerateTransitions(am, from, to, atom); */
6081  if (to == NULL) {
6082  to = xmlRegNewState(am);
6083  xmlRegStatePush(am, to);
6084  }
6085  xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6086  xmlRegAtomPush(am, atom);
6087  am->state = to;
6088 
6089  if (to == NULL)
6090  to = am->state;
6091  if (to == NULL)
6092  return(NULL);
6093  if (min == 0)
6094  xmlFAGenerateEpsilonTransition(am, from, to);
6095  return(to);
6096 }
6097 
6117 xmlAutomataStatePtr
6118 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6119  xmlAutomataStatePtr to, const xmlChar *token,
6120  const xmlChar *token2,
6121  int min, int max, void *data) {
6122  xmlRegAtomPtr atom;
6123  int counter;
6124 
6125  if ((am == NULL) || (from == NULL) || (token == NULL))
6126  return(NULL);
6127  if (min < 1)
6128  return(NULL);
6129  if (max < min)
6130  return(NULL);
6131  atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6132  if (atom == NULL)
6133  return(NULL);
6134  if ((token2 == NULL) || (*token2 == 0)) {
6135  atom->valuep = xmlStrdup(token);
6136  } else {
6137  int lenn, lenp;
6138  xmlChar *str;
6139 
6140  lenn = strlen((char *) token2);
6141  lenp = strlen((char *) token);
6142 
6143  str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6144  if (str == NULL) {
6145  xmlRegFreeAtom(atom);
6146  return(NULL);
6147  }
6148  memcpy(&str[0], token, lenp);
6149  str[lenp] = '|';
6150  memcpy(&str[lenp + 1], token2, lenn);
6151  str[lenn + lenp + 1] = 0;
6152 
6153  atom->valuep = str;
6154  }
6155  atom->data = data;
6156  atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6157  atom->min = min;
6158  atom->max = max;
6159  /*
6160  * associate a counter to the transition.
6161  */
6162  counter = xmlRegGetCounter(am);
6163  am->counters[counter].min = 1;
6164  am->counters[counter].max = 1;
6165 
6166  /* xmlFAGenerateTransitions(am, from, to, atom); */
6167  if (to == NULL) {
6168  to = xmlRegNewState(am);
6169  xmlRegStatePush(am, to);
6170  }
6171  xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6172  xmlRegAtomPush(am, atom);
6173  am->state = to;
6174  return(to);
6175 }
6176 
6177 
6178 
6197 xmlAutomataStatePtr
6198 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6199  xmlAutomataStatePtr to, const xmlChar *token,
6200  int min, int max, void *data) {
6201  xmlRegAtomPtr atom;
6202  int counter;
6203 
6204  if ((am == NULL) || (from == NULL) || (token == NULL))
6205  return(NULL);
6206  if (min < 1)
6207  return(NULL);
6208  if (max < min)
6209  return(NULL);
6210  atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6211  if (atom == NULL)
6212  return(NULL);
6213  atom->valuep = xmlStrdup(token);
6214  atom->data = data;
6215  atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6216  atom->min = min;
6217  atom->max = max;
6218  /*
6219  * associate a counter to the transition.
6220  */