ReactOS 0.4.15-dev-8058-ga7cbb60
tokenizer.hpp
Go to the documentation of this file.
/*
 * PROJECT: ReactOS host tools
 * LICENSE: MIT (https://spdx.org/licenses/MIT)
 * PURPOSE: Tokenizer class implementation
 * COPYRIGHT: Copyright 2021 Timo Kreuzer <timo.kreuzer@reactos.org>
 */
7
8#include <string>
9#include <vector>
10#include <fstream>
11#include <regex>
12#include <ctime>
13
// Uncomment this for easier debugging: when enabled, every `throw` in this
// file first breaks into the debugger.
#if 0
#define throw __debugbreak(); throw
#endif

// Running total of the wall-clock time spent inside regex searches, in the
// units returned by time(). Declared here only; the definition lives in
// another translation unit.
extern time_t search_time;
20
// Describes one kind of token the tokenizer can recognize.
struct TOKEN_DEF
{
    // Caller-defined identifier reported for tokens that match this definition.
    int Type;

    // Regular expression for the token. The tokenizer rejects definitions
    // that do not contain exactly one capture group.
    std::string RegExString;
};
26
// A non-owning view of one token inside a text: a position, a length and a
// type id. Only a reference to the text is stored, so the text must outlive
// the Token.
class Token
{
    const std::string& m_text;
    unsigned int m_pos;
    unsigned int m_len;
#if _DEBUG
    // Copy of the token text, kept only so it is visible in a debugger
    std::string m_dbgstr;
#endif
    int m_type;

public:

    // text: the full text the token belongs to (referenced, not copied)
    // pos:  offset of the token's first character within text
    // len:  number of characters in the token
    // type: token type id
    Token(const std::string& text, size_t pos, size_t len, int type)
        : m_text(text),
        m_pos(static_cast<unsigned int>(pos)),
        m_len(static_cast<unsigned int>(len)),
        m_type(type)
    {
#if _DEBUG
        m_dbgstr = str();
#endif
    }

    // Returns a copy of the token's characters.
    std::string str() const
    {
        return m_text.substr(m_pos, m_len);
    }

    // Returns the token type id passed to the constructor.
    int type() const
    {
        return m_type;
    }
};
60
62{
63 const std::vector<TOKEN_DEF> &m_tokendefs;
64 const std::regex m_re;
65
66 typedef int myint;
67
68 static
69 unsigned int
70 count_captures(const std::string& exp)
71 {
72 bool in_char_group = false;
73 unsigned int count = 0;
74
75 for (size_t i = 0; i < exp.size(); i++)
76 {
77 char c = exp[i];
78
79 // Skip escaped characters
80 if (c == '\\')
81 {
82 i++;
83 continue;
84 }
85
86 if (in_char_group)
87 {
88 if (c == ']')
89 {
90 in_char_group = false;
91 }
92 continue;
93 }
94
95 if (c == '[')
96 {
97 in_char_group = true;
98 continue;
99 }
100
101 if (c == '(')
102 {
103 if (exp[i + 1] != '?')
104 {
105 count++;
106 }
107 }
108 }
109
110 return count;
111 }
112
113 static
114 std::regex
115 CompileMultiRegex(const std::vector<TOKEN_DEF> &tokendefs)
116 {
117 std::string combinedString;
118
119 if (tokendefs.size() == 0)
120 {
121 return std::regex();
122 }
123
124 // Validate all token definitions
125 for (auto def : tokendefs)
126 {
127 size_t found = -1;
128
129 // Count capture groups
130 unsigned int count = count_captures(def.RegExString);
131 if (count != 1)
132 {
133 throw "invalid count!\n";
134 }
135 }
136
137 // Combine all expressions into one (one capture group for each)
138 combinedString = "(?:" + tokendefs[0].RegExString + ")";
139 for (size_t i = 1; i < tokendefs.size(); i++)
140 {
141 combinedString += "|(?:" + tokendefs[i].RegExString + ")";
142 }
143
144 return std::regex(combinedString, std::regex_constants::icase);
145 }
146
147public:
148
150 {
151 unsigned int pos;
152 unsigned int len;
153 int type;
154 };
155
156 Tokenizer(std::vector<TOKEN_DEF> &tokendefs)
157 : m_tokendefs(tokendefs),
158 m_re(CompileMultiRegex(tokendefs))
159 {
160 }
161
162 TOKEN_REF match(std::smatch &matches, const std::string& str) const
163 {
164 return match(matches, str, 0);
165 }
166
167 TOKEN_REF match(std::smatch &matches, const std::string &str, size_t startpos) const
168 {
169 const std::string::const_iterator first = str.cbegin() + startpos;
170 const std::string::const_iterator last = str.cend();
171
172 // If we reached the end, there is nothing more to do
173 if (first == last)
174 {
175 return TOKEN_REF{ static_cast<unsigned int>(startpos), 0, -1 };
176 }
177
178 time_t start_time = time(NULL);
179
180 // Try to find a match
181 if (!std::regex_search(first, last, matches, m_re))
182 {
183 throw "Failed to match\n";
184 }
185
186 search_time += time(NULL) - start_time;
187
188 // Validate that it's at the start of the string
189 if (matches.prefix().matched)
190 {
191 throw "Failed to match at current position!\n";
192 }
193
194 // We have a match, check which one it is
195 for (size_t i = 1; i < matches.size(); i++)
196 {
197 if (matches[i].matched)
198 {
199 unsigned int len = static_cast<unsigned int>(matches.length(i));
200 int type = m_tokendefs[i - 1].Type;
201 return TOKEN_REF{ static_cast<unsigned int>(startpos), len, type};
202 }
203 }
204
205 // We should never get here
206 throw "Something went wrong!\n";
207 }
208};
209
210
212{
214
216 const std::string& m_text;
217 std::vector<TOKEN_REF> m_tokens;
218
219public:
220
221 TokenList(const Tokenizer& tokenizer, const std::string& text)
222 : m_tokenizer(tokenizer),
223 m_text(text)
224 {
225 size_t startpos = 0;
226 size_t len = m_text.size();
227 std::smatch matches;
228
229 m_tokens.reserve(len / 5);
230
231 while (startpos < len)
232 {
233 TOKEN_REF tref = m_tokenizer.match(matches, m_text, startpos);
234 m_tokens.push_back(tref);
235 startpos += tref.len;
236 };
237 }
238
239 size_t size() const
240 {
241 return m_tokens.size();
242 }
243
244 Token operator[](size_t n) const
245 {
247 }
248
249};
TokenList(const Tokenizer &tokenizer, const std::string &text)
Definition: tokenizer.hpp:221
std::vector< TOKEN_REF > m_tokens
Definition: tokenizer.hpp:217
const Tokenizer & m_tokenizer
Definition: tokenizer.hpp:215
typename Tokenizer::TOKEN_REF TOKEN_REF
Definition: tokenizer.hpp:213
Token operator[](size_t n) const
Definition: tokenizer.hpp:244
size_t size() const
Definition: tokenizer.hpp:239
const std::string & m_text
Definition: tokenizer.hpp:216
unsigned int m_len
Definition: tokenizer.hpp:31
unsigned int m_pos
Definition: tokenizer.hpp:30
const std::string & m_text
Definition: tokenizer.hpp:29
Token(const std::string &text, size_t pos, size_t len, int type)
Definition: tokenizer.hpp:39
int type() const
Definition: tokenizer.hpp:55
int m_type
Definition: tokenizer.hpp:35
std::string str() const
Definition: tokenizer.hpp:50
#define NULL
Definition: types.h:112
const WCHAR * text
Definition: package.c:1799
unsigned int(__cdecl typeof(jpeg_read_scanlines))(struct jpeg_decompress_struct *
Definition: typeof.h:31
__kernel_time_t time_t
Definition: linux.h:252
GLuint GLuint GLsizei GLenum type
Definition: gl.h:1545
GLuint GLuint GLsizei count
Definition: gl.h:1545
GLdouble n
Definition: glext.h:7729
const GLubyte * c
Definition: glext.h:8905
const GLint * first
Definition: glext.h:5794
GLenum GLsizei len
Definition: glext.h:6722
GLsizei GLenum const GLvoid GLsizei GLenum GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLint GLint GLint GLshort GLshort GLshort GLubyte GLubyte GLubyte GLuint GLuint GLuint GLushort GLushort GLushort GLbyte GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLfloat GLint GLint GLint GLint GLshort GLshort GLshort GLshort GLubyte GLubyte GLubyte GLubyte GLuint GLuint GLuint GLuint GLushort GLushort GLushort GLushort GLboolean const GLdouble const GLfloat const GLint const GLshort const GLbyte const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLdouble const GLfloat const GLfloat const GLint const GLint const GLshort const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort GLenum GLenum GLenum GLfloat GLenum GLint GLenum GLenum GLenum GLfloat GLenum GLenum GLint GLenum GLfloat GLenum GLint GLint GLushort GLenum GLenum GLfloat GLenum GLenum GLint GLfloat const GLubyte GLenum GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLint GLint GLsizei GLsizei GLint GLenum GLenum const GLvoid GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLenum const GLdouble GLenum GLenum const GLfloat GLenum GLenum const GLint GLsizei GLuint GLfloat GLuint GLbitfield GLfloat GLint GLuint GLboolean GLenum GLfloat GLenum GLbitfield GLenum GLfloat GLfloat GLint GLint const GLfloat GLenum GLfloat GLfloat GLint GLint GLfloat GLfloat GLint GLint const GLfloat GLint GLfloat GLfloat GLint 
GLfloat GLfloat GLint GLfloat GLfloat const GLdouble const GLfloat const GLdouble const GLfloat GLint i
Definition: glfuncs.h:248
__u16 time
Definition: mkdosfs.c:8
#define matches(FN)
Definition: match.h:70
static UINT UINT last
Definition: font.c:45
static unsigned(__cdecl *hash_bstr)(bstr_t s)
DWORD exp
Definition: msg.c:16058
const WCHAR * str
std::string RegExString
Definition: tokenizer.hpp:24
unsigned int len
Definition: tokenizer.hpp:152
unsigned int pos
Definition: tokenizer.hpp:151
const std::regex m_re
Definition: tokenizer.hpp:64
static unsigned int count_captures(const std::string &exp)
Definition: tokenizer.hpp:70
TOKEN_REF match(std::smatch &matches, const std::string &str, size_t startpos) const
Definition: tokenizer.hpp:167
const std::vector< TOKEN_DEF > & m_tokendefs
Definition: tokenizer.hpp:63
TOKEN_REF match(std::smatch &matches, const std::string &str) const
Definition: tokenizer.hpp:162
static std::regex CompileMultiRegex(const std::vector< TOKEN_DEF > &tokendefs)
Definition: tokenizer.hpp:115
Tokenizer(std::vector< TOKEN_DEF > &tokendefs)
Definition: tokenizer.hpp:156
Definition: match.c:28
time_t search_time
Definition: asmpp.cpp:26