Titan Core Initial Contribution — common/JSON_Tokenizer.cc (repository: deliverable/titan.core.git)
1 #include <cstring>
2
3 #include "JSON_Tokenizer.hh"
4 #include "memory.h"
5 #include <cstdio>
6
7 static const char TABS[] =
8 "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
9 "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
10 "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
11 "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t";
12 const size_t MAX_TABS = sizeof(TABS) - 1; // 64
13
14 void JSON_Tokenizer::init(const char* p_buf, const size_t p_buf_len)
15 {
16 if (p_buf != 0 && p_buf_len != 0) {
17 buf_ptr = mcopystrn(p_buf, p_buf_len);
18 } else {
19 buf_ptr = 0;
20 }
21 buf_len = p_buf_len;
22 buf_pos = 0;
23 depth = 0;
24 previous_token = JSON_TOKEN_NONE;
25 }
26
27 JSON_Tokenizer::~JSON_Tokenizer()
28 {
29 Free(buf_ptr);
30 }
31
32 void JSON_Tokenizer::put_c(const char c)
33 {
34 buf_ptr = mputprintf(buf_ptr, "%c", c);
35 ++buf_len;
36 }
37
38 void JSON_Tokenizer::put_s(const char* s)
39 {
40 buf_ptr = mputstr(buf_ptr, s);
41 buf_len += strlen(s);
42 }
43
44 void JSON_Tokenizer::put_depth()
45 {
46 put_s(TABS + ((depth > MAX_TABS) ? 0 : MAX_TABS - depth));
47 }
48
49 bool JSON_Tokenizer::skip_white_spaces()
50 {
51 while(buf_pos < buf_len) {
52 switch(buf_ptr[buf_pos]) {
53 case ' ':
54 case '\r':
55 case '\n':
56 case '\t':
57 case '\f':
58 ++buf_pos;
59 break;
60 default:
61 return true;
62 }
63 }
64 return false;
65 }
66
67 bool JSON_Tokenizer::check_for_string()
68 {
69 if ('\"' == buf_ptr[buf_pos]) {
70 ++buf_pos;
71 } else {
72 return false;
73 }
74 while (buf_pos < buf_len) {
75 if ('\"' == buf_ptr[buf_pos]) {
76 return true;
77 }
78 else if ('\\' == buf_ptr[buf_pos]) {
79 // skip escaped character (so escaped quotes (\") are not mistaken for the ending quotes)
80 ++buf_pos;
81 }
82 ++buf_pos;
83 }
84 return false;
85 }
86
87 bool JSON_Tokenizer::check_for_number()
88 {
89 bool first_digit = false; // first non-zero digit reached
90 bool zero = false; // first zero digit reached
91 bool decimal_point = false; // decimal point (.) reached
92 bool exponent_mark = false; // exponential mark (e or E) reached
93 bool exponent_sign = false; // sign of the exponential (- or +) reached
94
95 if ('-' == buf_ptr[buf_pos]) {
96 ++buf_pos;
97 }
98
99 while (buf_pos < buf_len) {
100 switch(buf_ptr[buf_pos]) {
101 case '.':
102 if (decimal_point || exponent_mark || (!first_digit && !zero)) {
103 return false;
104 }
105 decimal_point = true;
106 first_digit = false;
107 zero = false;
108 break;
109 case 'e':
110 case 'E':
111 if (exponent_mark || (!first_digit && !zero)) {
112 return false;
113 }
114 exponent_mark = true;
115 first_digit = false;
116 zero = false;
117 break;
118 case '0':
119 if (!first_digit && (exponent_mark || (!decimal_point && zero))) {
120 return false;
121 }
122 zero = true;
123 break;
124 case '1':
125 case '2':
126 case '3':
127 case '4':
128 case '5':
129 case '6':
130 case '7':
131 case '8':
132 case '9':
133 if (!first_digit && zero && (!decimal_point || exponent_mark)) {
134 return false;
135 }
136 first_digit = true;
137 break;
138 case '-':
139 case '+':
140 if (exponent_sign || !exponent_mark || zero || first_digit) {
141 return false;
142 }
143 exponent_sign = true;
144 break;
145 default:
146 return first_digit || zero;
147 }
148
149 ++buf_pos;
150 }
151 return first_digit || zero;
152 }
153
154 bool JSON_Tokenizer::check_for_separator()
155 {
156 if (buf_pos < buf_len) {
157 switch(buf_ptr[buf_pos]) {
158 case ',':
159 ++buf_pos;
160 // no break
161 case ':':
162 case '{':
163 case '}':
164 case '[':
165 case ']':
166 return true;
167 default:
168 return false;
169 }
170 }
171 return true;
172 }
173
174 bool JSON_Tokenizer::check_for_literal(const char* p_literal)
175 {
176 size_t len = strlen(p_literal);
177 size_t start_pos = buf_pos;
178
179 if (buf_len - buf_pos >= len &&
180 0 == strncmp(buf_ptr + buf_pos, p_literal, len)) {
181 buf_pos += len;
182 if (!skip_white_spaces() || check_for_separator()) {
183 return true;
184 } else {
185 // must be followed by a separator (or only white spaces until the buffer ends) -> undo buffer action
186 buf_pos = start_pos;
187 }
188 }
189 return false;
190 }
191
192 int JSON_Tokenizer::get_next_token(json_token_t* p_token, char** p_token_str, size_t* p_str_len)
193 {
194 size_t start_pos = buf_pos;
195 *p_token = JSON_TOKEN_NONE;
196 if (0 != p_token_str && 0 != p_str_len) {
197 *p_token_str = 0;
198 *p_str_len = 0;
199 }
200
201 if (skip_white_spaces()) {
202 char c = buf_ptr[buf_pos];
203 switch (c) {
204 case '{':
205 case '[':
206 *p_token = ('{' == c) ? JSON_TOKEN_OBJECT_START : JSON_TOKEN_ARRAY_START;
207 ++buf_pos;
208 break;
209 case '}':
210 case ']':
211 ++buf_pos;
212 if (skip_white_spaces() && !check_for_separator()) {
213 // must be followed by a separator (or only white spaces until the buffer ends)
214 *p_token = JSON_TOKEN_ERROR;
215 } else {
216 *p_token = ('}' == c) ? JSON_TOKEN_OBJECT_END : JSON_TOKEN_ARRAY_END;
217 }
218 break;
219 case '\"': {
220 // string value or field name
221 size_t string_start_pos = buf_pos;
222 if(!check_for_string()) {
223 // invalid string value
224 *p_token = JSON_TOKEN_ERROR;
225 break;
226 }
227 size_t string_end_pos = ++buf_pos; // step over the string's ending quotes
228 if (skip_white_spaces() && ':' == buf_ptr[buf_pos]) {
229 // name token - don't include the starting and ending quotes
230 *p_token = JSON_TOKEN_NAME;
231 if (0 != p_token_str && 0 != p_str_len) {
232 *p_token_str = buf_ptr + string_start_pos + 1;
233 *p_str_len = string_end_pos - string_start_pos - 2;
234 }
235 ++buf_pos;
236 } else if (check_for_separator()) {
237 // value token - include the starting and ending quotes
238 *p_token = JSON_TOKEN_STRING;
239 if (0 != p_token_str && 0 != p_str_len) {
240 *p_token_str = buf_ptr + string_start_pos;
241 *p_str_len = string_end_pos - string_start_pos;
242 }
243 } else {
244 // value token, but there is no separator after it -> error
245 *p_token = JSON_TOKEN_ERROR;
246 break;
247 }
248 break;
249 } // case: string value or field name
250 default:
251 if (('0' <= buf_ptr[buf_pos] && '9' >= buf_ptr[buf_pos]) ||
252 '-' == buf_ptr[buf_pos]) {
253 // number value
254 size_t number_start_pos = buf_pos;
255 if (!check_for_number()) {
256 // invalid number
257 *p_token = JSON_TOKEN_ERROR;
258 break;
259 }
260 size_t number_length = buf_pos - number_start_pos;
261 if (skip_white_spaces() && !check_for_separator()) {
262 // must be followed by a separator (or only white spaces until the buffer ends)
263 *p_token = JSON_TOKEN_ERROR;
264 break;
265 }
266 *p_token = JSON_TOKEN_NUMBER;
267 if (0 != p_token_str && 0 != p_str_len) {
268 *p_token_str = buf_ptr + number_start_pos;
269 *p_str_len = number_length;
270 }
271 break;
272 } // if (number value)
273 else if (check_for_literal("true")) {
274 *p_token = JSON_TOKEN_LITERAL_TRUE;
275 break;
276 }
277 else if (check_for_literal("false")) {
278 *p_token = JSON_TOKEN_LITERAL_FALSE;
279 break;
280 }
281 else if (check_for_literal("null")) {
282 *p_token = JSON_TOKEN_LITERAL_NULL;
283 break;
284 }
285 else {
286 *p_token = JSON_TOKEN_ERROR;
287 break;
288 }
289 } // switch (current char)
290 } // if (skip_white_spaces())
291
292 return buf_pos - start_pos;
293 }
294
295 void JSON_Tokenizer::put_separator()
296 {
297 if (JSON_TOKEN_NAME != previous_token && JSON_TOKEN_NONE != previous_token &&
298 JSON_TOKEN_ARRAY_START != previous_token && JSON_TOKEN_OBJECT_START != previous_token) {
299 put_c(',');
300 if (pretty) {
301 put_c('\n');
302 put_depth();
303 }
304 }
305 }
306
307 int JSON_Tokenizer::put_next_token(json_token_t p_token, const char* p_token_str)
308 {
309 int start_len = buf_len;
310 switch(p_token) {
311 case JSON_TOKEN_OBJECT_START:
312 case JSON_TOKEN_ARRAY_START: {
313 put_separator();
314 put_c( (JSON_TOKEN_OBJECT_START == p_token) ? '{' : '[' );
315 if (pretty) {
316 put_c('\n');
317 ++depth;
318 put_depth();
319 }
320 break;
321 }
322 case JSON_TOKEN_OBJECT_END:
323 case JSON_TOKEN_ARRAY_END: {
324 if (pretty) {
325 if (JSON_TOKEN_OBJECT_START != previous_token && JSON_TOKEN_ARRAY_START != previous_token) {
326 put_c('\n');
327 --depth;
328 put_depth();
329 } else if (MAX_TABS >= depth) {
330 // empty object or array -> remove the extra tab added at the start token
331 --depth;
332 --buf_len;
333 buf_ptr[buf_len] = 0;
334 }
335 }
336 put_c( (JSON_TOKEN_OBJECT_END == p_token) ? '}' : ']' );
337 break;
338 }
339 case JSON_TOKEN_NUMBER:
340 case JSON_TOKEN_STRING:
341 put_separator();
342 put_s(p_token_str);
343 break;
344 case JSON_TOKEN_LITERAL_TRUE:
345 put_separator();
346 put_s("true");
347 break;
348 case JSON_TOKEN_LITERAL_FALSE:
349 put_separator();
350 put_s("false");
351 break;
352 case JSON_TOKEN_LITERAL_NULL:
353 put_separator();
354 put_s("null");
355 break;
356 case JSON_TOKEN_NAME:
357 put_separator();
358 put_c('\"');
359 put_s(p_token_str);
360 if (pretty) {
361 put_s("\" : ");
362 } else {
363 put_s("\":");
364 }
365 break;
366 default:
367 return 0;
368 }
369
370 previous_token = p_token;
371 return buf_len - start_len;
372 }
373
This page took 0.038811 seconds and 5 git commands to generate.