lexer.c

   1 /*
   2  * Copyright (C) 2013-2014 Jo-Philipp Wich <jow@openwrt.org>
   3  *
   4  * Permission to use, copy, modify, and/or distribute this software for any
   5  * purpose with or without fee is hereby granted, provided that the above
   6  * copyright notice and this permission notice appear in all copies.
   7  *
   8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15  */
  16
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
  21
  22 #include "ast.h"
  23 #include "lexer.h"
  24 #include "parser.h"
  25
  26
  27 struct token {
  28         int type;
  29         const char *pat;
  30         int plen;
  31         int (*parse)(const char *buf, struct jp_opcode *op, struct jp_state *s);
  32 };
  33
  34 #define dec(o) \
  35         ((o) - '0')
  36
  37 #define hex(x) \
  38         (((x) >= 'a') ? (10 + (x) - 'a') : \
  39                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
  40
  41 /*
  42  * Stores the given codepoint as a utf8 multibyte sequence into the given
  43  * output buffer and substracts the required amount of bytes from  the given
  44  * length pointer.
  45  *
  46  * Returns false if the multibyte sequence would not fit into the buffer,
  47  * otherwise true.
  48  */
  49
  50 static bool
  51 utf8enc(char **out, int *rem, int code)
  52 {
  53         if (code > 0 && code <= 0x7F)
  54         {
  55                 if (*rem < 1)
  56                         return false;
  57
  58                 *(*out++) = code; (*rem)--;
  59                 return true;
  60         }
  61         else if (code > 0 && code <= 0x7FF)
  62         {
  63                 if (*rem < 2)
  64                         return false;
  65
  66                 *(*out)++ = ((code >>  6) & 0x1F) | 0xC0; (*rem)--;
  67                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
  68                 return true;
  69         }
  70         else if (code > 0 && code <= 0xFFFF)
  71         {
  72                 if (*rem < 3)
  73                         return false;
  74
  75                 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--;
  76                 *(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
  77                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
  78                 return true;
  79         }
  80         else if (code > 0 && code <= 0x10FFFF)
  81         {
  82                 if (*rem < 4)
  83                         return false;
  84
  85                 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--;
  86                 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--;
  87                 *(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
  88                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
  89                 return true;
  90         }
  91
  92         return true;
  93 }
  94
  95
  96 /*
  97  * Parses a string literal from the given buffer.
  98  *
  99  * Returns a negative value on error, otherwise the amount of consumed
 100  * characters from the given buffer.
 101  *
 102  * Error values:
 103  *  -1  Unterminated string
 104  *  -2  Invalid escape sequence
 105  *  -3  String literal too long
 106  */
 107
 108 static int
 109 parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
 110 {
 111         char q = *(buf++);
 112         char str[128] = { 0 };
 113         char *out = str;
 114         const char *in = buf;
 115         bool esc = false;
 116         int rem = sizeof(str) - 1;
 117         int code;
 118
 119         while (*in)
 120         {
 121                 /* continuation of escape sequence */
 122                 if (esc)
 123                 {
 124                         /* \uFFFF */
 125                         if (in[0] == 'u')
 126                         {
 127                                 if (isxdigit(in[1]) && isxdigit(in[2]) &&
 128                                     isxdigit(in[3]) && isxdigit(in[4]))
 129                                 {
 130                                         if (!utf8enc(&out, &rem,
 131                                                      hex(in[1]) * 16 * 16 * 16 +
 132                                                      hex(in[2]) * 16 * 16 +
 133                                                      hex(in[3]) * 16 +
 134                                                      hex(in[4])))
 135                                         {
 136                                                 s->error_pos = s->off + (in - buf);
 137                                                 return -3;
 138                                         }
 139
 140                                         in += 5;
 141                                 }
 142                                 else
 143                                 {
 144                                         s->error_pos = s->off + (in - buf);
 145                                         return -2;
 146                                 }
 147                         }
 148
 149                         /* \xFF */
 150                         else if (in[0] == 'x')
 151                         {
 152                                 if (isxdigit(in[1]) && isxdigit(in[2]))
 153                                 {
 154                                         if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
 155                                         {
 156                                                 s->error_pos = s->off + (in - buf);
 157                                                 return -3;
 158                                         }
 159
 160                                         in += 3;
 161                                 }
 162                                 else
 163                                 {
 164                                         s->error_pos = s->off + (in - buf);
 165                                         return -2;
 166                                 }
 167                         }
 168
 169                         /* \377, \77 or \7 */
 170                         else if (in[0] >= '0' && in[0] <= '7')
 171                         {
 172                                 /* \377 */
 173                                 if (in[1] >= '0' && in[1] <= '7' &&
 174                                     in[2] >= '0' && in[2] <= '7')
 175                                 {
 176                                         code = dec(in[0]) * 8 * 8 +
 177                                                dec(in[1]) * 8 +
 178                                                dec(in[2]);
 179
 180                                         if (code > 255)
 181                                         {
 182                                                 s->error_pos = s->off + (in - buf);
 183                                                 return -2;
 184                                         }
 185
 186                                         if (!utf8enc(&out, &rem, code))
 187                                         {
 188                                                 s->error_pos = s->off + (in - buf);
 189                                                 return -3;
 190                                         }
 191
 192                                         in += 3;
 193                                 }
 194
 195                                 /* \77 */
 196                                 else if (in[1] >= '0' && in[1] <= '7')
 197                                 {
 198                                         if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
 199                                         {
 200                                                 s->error_pos = s->off + (in - buf);
 201                                                 return -3;
 202                                         }
 203
 204                                         in += 2;
 205                                 }
 206
 207                                 /* \7 */
 208                                 else
 209                                 {
 210                                         if (!utf8enc(&out, &rem, dec(in[0])))
 211                                         {
 212                                                 s->error_pos = s->off + (in - buf);
 213                                                 return -3;
 214                                         }
 215
 216                                         in += 1;
 217                                 }
 218                         }
 219
 220                         /* single character escape */
 221                         else
 222                         {
 223                                 if (rem-- < 1)
 224                                 {
 225                                         s->error_pos = s->off + (in - buf);
 226                                         return -3;
 227                                 }
 228
 229                                 switch (in[0])
 230                                 {
 231                                 case 'a': *out = '\a'; break;
 232                                 case 'b': *out = '\b'; break;
 233                                 case 'e': *out = '\e'; break;
 234                                 case 'f': *out = '\f'; break;
 235                                 case 'n': *out = '\n'; break;
 236                                 case 'r': *out = '\r'; break;
 237                                 case 't': *out = '\t'; break;
 238                                 case 'v': *out = '\v'; break;
 239                                 default:  *out = *in; break;
 240                                 }
 241
 242                                 in++;
 243                                 out++;
 244                         }
 245
 246                         esc = false;
 247                 }
 248
 249                 /* begin of escape sequence */
 250                 else if (*in == '\\')
 251                 {
 252                         in++;
 253                         esc = true;
 254                 }
 255
 256                 /* terminating quote */
 257                 else if (*in == q)
 258                 {
 259                         op->str = strdup(str);
 260                         return (in - buf) + 2;
 261                 }
 262
 263                 /* ordinary char */
 264                 else
 265                 {
 266                         if (rem-- < 1)
 267                         {
 268                                 s->error_pos = s->off + (in - buf);
 269                                 return -3;
 270                         }
 271
 272                         *out++ = *in++;
 273                 }
 274         }
 275
 276         return -1;
 277 }
 278
 279
 280 /*
 281  * Parses a label from the given buffer.
 282  *
 283  * Returns a negative value on error, otherwise the amount of consumed
 284  * characters from the given buffer.
 285  *
 286  * Error values:
 287  *  -3  Label too long
 288  */
 289
 290 static int
 291 parse_label(const char *buf, struct jp_opcode *op, struct jp_state *s)
 292 {
 293         char str[128] = { 0 };
 294         char *out = str;
 295         const char *in = buf;
 296         int rem = sizeof(str) - 1;
 297
 298         while (*in == '_' || isalnum(*in))
 299         {
 300                 if (rem-- < 1)
 301                 {
 302                         s->error_pos = s->off + (in - buf);
 303                         return -3;
 304                 }
 305
 306                 *out++ = *in++;
 307         }
 308
 309         if (!strcmp(str, "true") || !strcmp(str, "false"))
 310         {
 311                 op->num = (str[0] == 't');
 312                 op->type = T_BOOL;
 313         }
 314         else
 315         {
 316                 op->str = strdup(str);
 317         }
 318
 319         return (in - buf);
 320 }
 321
 322
 323 /*
 324  * Parses a number literal from the given buffer.
 325  *
 326  * Returns a negative value on error, otherwise the amount of consumed
 327  * characters from the given buffer.
 328  *
 329  * Error values:
 330  *  -2  Invalid number character
 331  */
 332
 333 static int
 334 parse_number(const char *buf, struct jp_opcode *op, struct jp_state *s)
 335 {
 336         char *e;
 337         int n = strtol(buf, &e, 10);
 338
 339         if (e == buf)
 340         {
 341                 s->error_pos = s->off;
 342                 return -2;
 343         }
 344
 345         op->num = n;
 346
 347         return (e - buf);
 348 }
 349
 350 static const struct token tokens[] = {
 351         { 0,                    " ",     1 },
 352         { 0,                    "\t",    1 },
 353         { 0,                    "\n",    1 },
 354         { T_LE,                 "<=",    2 },
 355         { T_GE,                 ">=",    2 },
 356         { T_NE,                 "!=",    2 },
 357         { T_AND,                "&&",    2 },
 358         { T_OR,                 "||",    2 },
 359         { T_DOT,                ".",     1 },
 360         { T_BROPEN,             "[",     1 },
 361         { T_BRCLOSE,    "]",     1 },
 362         { T_POPEN,              "(",     1 },
 363         { T_PCLOSE,             ")",     1 },
 364         { T_UNION,              ",",     1 },
 365         { T_ROOT,               "$",     1 },
 366         { T_THIS,               "@",     1 },
 367         { T_LT,                 "<",     1 },
 368         { T_GT,                 ">",     1 },
 369         { T_EQ,                 "=",     1 },
 370         { T_NOT,                "!",     1 },
 371         { T_WILDCARD,   "*",     1 },
 372         { T_STRING,             "'",     1, parse_string },
 373         { T_STRING,             "\"",    1, parse_string },
 374         { T_LABEL,              "_",     1, parse_label  },
 375         { T_LABEL,              "az",    0, parse_label  },
 376         { T_LABEL,              "AZ",    0, parse_label  },
 377         { T_NUMBER,             "-",     1, parse_number },
 378         { T_NUMBER,             "09",    0, parse_number },
 379 };
 380
 381 const char *tokennames[23] = {
 382         [0]                             = "End of file",
 383         [T_AND]                 = "'&&'",
 384         [T_OR]                  = "'||'",
 385         [T_UNION]               = "','",
 386         [T_EQ]                  = "'='",
 387         [T_NE]                  = "'!='",
 388         [T_GT]                  = "'>'",
 389         [T_GE]                  = "'>='",
 390         [T_LT]                  = "'<'",
 391         [T_LE]                  = "'<='",
 392         [T_NOT]                 = "'!'",
 393         [T_LABEL]               = "Label",
 394         [T_ROOT]                = "'$'",
 395         [T_THIS]                = "'@'",
 396         [T_DOT]                 = "'.'",
 397         [T_WILDCARD]    = "'*'",
 398         [T_BROPEN]              = "'['",
 399         [T_BRCLOSE]             = "']'",
 400         [T_BOOL]                = "Bool",
 401         [T_NUMBER]              = "Number",
 402         [T_STRING]              = "String",
 403         [T_POPEN]               = "'('",
 404         [T_PCLOSE]              = "')'",
 405 };
 406
 407
 408 static int
 409 match_token(const char *ptr, struct jp_opcode *op, struct jp_state *s)
 410 {
 411         int i;
 412         const struct token *tok;
 413
 414         for (i = 0, tok = &tokens[0];
 415              i < sizeof(tokens) / sizeof(tokens[0]);
 416                  i++, tok = &tokens[i])
 417         {
 418                 if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
 419                     (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
 420                 {
 421                         op->type = tok->type;
 422
 423                         if (tok->parse)
 424                                 return tok->parse(ptr, op, s);
 425
 426                         return tok->plen;
 427                 }
 428         }
 429
 430         s->error_pos = s->off;
 431         return -4;
 432 }
 433
 434 struct jp_opcode *
 435 jp_get_token(struct jp_state *s, const char *input, int *mlen)
 436 {
 437         struct jp_opcode op = { 0 };
 438
 439         *mlen = match_token(input, &op, s);
 440
 441         if (*mlen < 0)
 442         {
 443                 s->error_code = *mlen;
 444                 return NULL;
 445         }
 446         else if (op.type == 0)
 447         {
 448                 return NULL;
 449         }
 450
 451         return jp_alloc_op(s, op.type, op.num, op.str, NULL);
 452 }