contrib: add ParseTrace prototype to parser skeleton
[project/jsonpath.git] / lexer.c
1 /*
2 * Copyright (C) 2013-2014 Jo-Philipp Wich <jow@openwrt.org>
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include <stdbool.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <ctype.h>
21
22 #include "ast.h"
23 #include "lexer.h"
24 #include "parser.h"
25
26
/*
 * One entry of the lexer's token table.
 */
struct token {
	/* token type copied into the opcode when this entry matches */
	int type;

	/*
	 * Pattern to match: a literal compared over plen bytes when plen > 0,
	 * or a two character inclusive range (low, high) when plen is 0.
	 */
	const char *pat;

	/* length of the literal pattern, 0 selects range matching */
	int plen;

	/*
	 * Optional callback that consumes the token itself; when set, its
	 * return value is used as the match result instead of plen.
	 */
	int (*parse)(const char *buf, struct jp_opcode *op, struct jp_state *s);
};
33
/* Numeric value of the decimal digit character o ('0'..'9' -> 0..9). */
#define dec(o) ((o) - '0')

/*
 * Numeric value of the hexadecimal digit character x
 * ('0'..'9' -> 0..9, 'A'..'F' and 'a'..'f' -> 10..15).
 *
 * The argument is evaluated more than once, so it must be free of side
 * effects. No validation is performed on x.
 */
#define hex(x) \
	(((x) < 'A') ? dec(x) : \
		(((x) < 'a') ? ((x) - 'A' + 10) : ((x) - 'a' + 10)))
40
/*
 * Stores the given codepoint as a utf8 multibyte sequence into the given
 * output buffer and subtracts the required amount of bytes from the given
 * length pointer. The output pointer is advanced past the bytes written.
 *
 * Codepoints which are zero, negative or larger than 0x10FFFF are not
 * encoded: nothing is written and true is returned.
 *
 * Returns false if the multibyte sequence would not fit into the buffer,
 * otherwise true.
 */

static bool
utf8enc(char **out, int *rem, int code)
{
	if (code > 0 && code <= 0x7F)
	{
		if (*rem < 1)
			return false;

		/*
		 * Advance the caller's cursor (*out), not the local parameter;
		 * otherwise the next write would overwrite this byte.
		 */
		*(*out)++ = code; (*rem)--;
		return true;
	}
	else if (code > 0 && code <= 0x7FF)
	{
		if (*rem < 2)
			return false;

		*(*out)++ = ((code >>  6) & 0x1F) | 0xC0; (*rem)--;
		*(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
		return true;
	}
	else if (code > 0 && code <= 0xFFFF)
	{
		if (*rem < 3)
			return false;

		*(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--;
		*(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
		*(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
		return true;
	}
	else if (code > 0 && code <= 0x10FFFF)
	{
		if (*rem < 4)
			return false;

		*(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--;
		*(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--;
		*(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
		*(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
		return true;
	}

	/* out of range codepoints are silently skipped */
	return true;
}
94
95
96 /*
97 * Parses a string literal from the given buffer.
98 *
99 * Returns a negative value on error, otherwise the amount of consumed
100 * characters from the given buffer.
101 *
102 * Error values:
103 * -1 Unterminated string
104 * -2 Invalid escape sequence
105 * -3 String literal too long
106 */
107
108 static int
109 parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
110 {
111 char q = *(buf++);
112 char str[128] = { 0 };
113 char *out = str;
114 const char *in = buf;
115 bool esc = false;
116 int rem = sizeof(str) - 1;
117 int code;
118
119 while (*in)
120 {
121 /* continuation of escape sequence */
122 if (esc)
123 {
124 /* \uFFFF */
125 if (in[0] == 'u')
126 {
127 if (isxdigit(in[1]) && isxdigit(in[2]) &&
128 isxdigit(in[3]) && isxdigit(in[4]))
129 {
130 if (!utf8enc(&out, &rem,
131 hex(in[1]) * 16 * 16 * 16 +
132 hex(in[2]) * 16 * 16 +
133 hex(in[3]) * 16 +
134 hex(in[4])))
135 {
136 s->error_pos = s->off + (in - buf);
137 return -3;
138 }
139
140 in += 5;
141 }
142 else
143 {
144 s->error_pos = s->off + (in - buf);
145 return -2;
146 }
147 }
148
149 /* \xFF */
150 else if (in[0] == 'x')
151 {
152 if (isxdigit(in[1]) && isxdigit(in[2]))
153 {
154 if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
155 {
156 s->error_pos = s->off + (in - buf);
157 return -3;
158 }
159
160 in += 3;
161 }
162 else
163 {
164 s->error_pos = s->off + (in - buf);
165 return -2;
166 }
167 }
168
169 /* \377, \77 or \7 */
170 else if (in[0] >= '0' && in[0] <= '7')
171 {
172 /* \377 */
173 if (in[1] >= '0' && in[1] <= '7' &&
174 in[2] >= '0' && in[2] <= '7')
175 {
176 code = dec(in[0]) * 8 * 8 +
177 dec(in[1]) * 8 +
178 dec(in[2]);
179
180 if (code > 255)
181 {
182 s->error_pos = s->off + (in - buf);
183 return -2;
184 }
185
186 if (!utf8enc(&out, &rem, code))
187 {
188 s->error_pos = s->off + (in - buf);
189 return -3;
190 }
191
192 in += 3;
193 }
194
195 /* \77 */
196 else if (in[1] >= '0' && in[1] <= '7')
197 {
198 if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
199 {
200 s->error_pos = s->off + (in - buf);
201 return -3;
202 }
203
204 in += 2;
205 }
206
207 /* \7 */
208 else
209 {
210 if (!utf8enc(&out, &rem, dec(in[0])))
211 {
212 s->error_pos = s->off + (in - buf);
213 return -3;
214 }
215
216 in += 1;
217 }
218 }
219
220 /* single character escape */
221 else
222 {
223 if (rem-- < 1)
224 {
225 s->error_pos = s->off + (in - buf);
226 return -3;
227 }
228
229 switch (in[0])
230 {
231 case 'a': *out = '\a'; break;
232 case 'b': *out = '\b'; break;
233 case 'e': *out = '\e'; break;
234 case 'f': *out = '\f'; break;
235 case 'n': *out = '\n'; break;
236 case 'r': *out = '\r'; break;
237 case 't': *out = '\t'; break;
238 case 'v': *out = '\v'; break;
239 default: *out = *in; break;
240 }
241
242 in++;
243 out++;
244 }
245
246 esc = false;
247 }
248
249 /* begin of escape sequence */
250 else if (*in == '\\')
251 {
252 in++;
253 esc = true;
254 }
255
256 /* terminating quote */
257 else if (*in == q)
258 {
259 op->str = strdup(str);
260 return (in - buf) + 2;
261 }
262
263 /* ordinary char */
264 else
265 {
266 if (rem-- < 1)
267 {
268 s->error_pos = s->off + (in - buf);
269 return -3;
270 }
271
272 *out++ = *in++;
273 }
274 }
275
276 return -1;
277 }
278
279
280 /*
281 * Parses a label from the given buffer.
282 *
283 * Returns a negative value on error, otherwise the amount of consumed
284 * characters from the given buffer.
285 *
286 * Error values:
287 * -3 Label too long
288 */
289
290 static int
291 parse_label(const char *buf, struct jp_opcode *op, struct jp_state *s)
292 {
293 char str[128] = { 0 };
294 char *out = str;
295 const char *in = buf;
296 int rem = sizeof(str) - 1;
297
298 while (*in == '_' || isalnum(*in))
299 {
300 if (rem-- < 1)
301 {
302 s->error_pos = s->off + (in - buf);
303 return -3;
304 }
305
306 *out++ = *in++;
307 }
308
309 if (!strcmp(str, "true") || !strcmp(str, "false"))
310 {
311 op->num = (str[0] == 't');
312 op->type = T_BOOL;
313 }
314 else
315 {
316 op->str = strdup(str);
317 }
318
319 return (in - buf);
320 }
321
322
323 /*
324 * Parses a number literal from the given buffer.
325 *
326 * Returns a negative value on error, otherwise the amount of consumed
327 * characters from the given buffer.
328 *
329 * Error values:
330 * -2 Invalid number character
331 */
332
333 static int
334 parse_number(const char *buf, struct jp_opcode *op, struct jp_state *s)
335 {
336 char *e;
337 int n = strtol(buf, &e, 10);
338
339 if (e == buf)
340 {
341 s->error_pos = s->off;
342 return -2;
343 }
344
345 op->num = n;
346
347 return (e - buf);
348 }
349
350 static const struct token tokens[] = {
351 { 0, " ", 1 },
352 { 0, "\t", 1 },
353 { 0, "\n", 1 },
354 { T_LE, "<=", 2 },
355 { T_GE, ">=", 2 },
356 { T_NE, "!=", 2 },
357 { T_AND, "&&", 2 },
358 { T_OR, "||", 2 },
359 { T_DOT, ".", 1 },
360 { T_BROPEN, "[", 1 },
361 { T_BRCLOSE, "]", 1 },
362 { T_POPEN, "(", 1 },
363 { T_PCLOSE, ")", 1 },
364 { T_UNION, ",", 1 },
365 { T_ROOT, "$", 1 },
366 { T_THIS, "@", 1 },
367 { T_LT, "<", 1 },
368 { T_GT, ">", 1 },
369 { T_EQ, "=", 1 },
370 { T_NOT, "!", 1 },
371 { T_WILDCARD, "*", 1 },
372 { T_STRING, "'", 1, parse_string },
373 { T_STRING, "\"", 1, parse_string },
374 { T_LABEL, "_", 1, parse_label },
375 { T_LABEL, "az", 0, parse_label },
376 { T_LABEL, "AZ", 0, parse_label },
377 { T_NUMBER, "-", 1, parse_number },
378 { T_NUMBER, "09", 0, parse_number },
379 };
380
381 const char *tokennames[23] = {
382 [0] = "End of file",
383 [T_AND] = "'&&'",
384 [T_OR] = "'||'",
385 [T_UNION] = "','",
386 [T_EQ] = "'='",
387 [T_NE] = "'!='",
388 [T_GT] = "'>'",
389 [T_GE] = "'>='",
390 [T_LT] = "'<'",
391 [T_LE] = "'<='",
392 [T_NOT] = "'!'",
393 [T_LABEL] = "Label",
394 [T_ROOT] = "'$'",
395 [T_THIS] = "'@'",
396 [T_DOT] = "'.'",
397 [T_WILDCARD] = "'*'",
398 [T_BROPEN] = "'['",
399 [T_BRCLOSE] = "']'",
400 [T_BOOL] = "Bool",
401 [T_NUMBER] = "Number",
402 [T_STRING] = "String",
403 [T_POPEN] = "'('",
404 [T_PCLOSE] = "')'",
405 };
406
407
408 static int
409 match_token(const char *ptr, struct jp_opcode *op, struct jp_state *s)
410 {
411 int i;
412 const struct token *tok;
413
414 for (i = 0, tok = &tokens[0];
415 i < sizeof(tokens) / sizeof(tokens[0]);
416 i++, tok = &tokens[i])
417 {
418 if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
419 (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
420 {
421 op->type = tok->type;
422
423 if (tok->parse)
424 return tok->parse(ptr, op, s);
425
426 return tok->plen;
427 }
428 }
429
430 s->error_pos = s->off;
431 return -4;
432 }
433
434 struct jp_opcode *
435 jp_get_token(struct jp_state *s, const char *input, int *mlen)
436 {
437 struct jp_opcode op = { 0 };
438
439 *mlen = match_token(input, &op, s);
440
441 if (*mlen < 0)
442 {
443 s->error_code = *mlen;
444 return NULL;
445 }
446 else if (op.type == 0)
447 {
448 return NULL;
449 }
450
451 return jp_alloc_op(s, op.type, op.num, op.str, NULL);
452 }