|
@@ -30,6 +30,7 @@
|
|
|
#include <float.h>
|
|
|
#include <limits.h>
|
|
|
#include <ctype.h>
|
|
|
+#include <stdint.h>
|
|
|
#include "cJSON.h"
|
|
|
|
|
|
/* define our own boolean type */
|
|
@@ -403,201 +404,257 @@ static unsigned parse_hex4(const unsigned char *str)
|
|
|
return h;
|
|
|
}
|
|
|
|
|
|
-/* first bytes of UTF8 encoding for a given length in bytes */
|
|
|
-static const unsigned char firstByteMark[5] =
|
|
|
+/* converts a UTF-16 literal to UTF-8
|
|
|
+ * A literal can be one or two sequences of the form \uXXXX */
|
|
|
+static uint8_t utf16_literal_to_utf8(const unsigned char * const input_pointer, const unsigned char * const input_end, unsigned char **output_pointer, const unsigned char **error_pointer)
|
|
|
{
|
|
|
- 0x00, /* should never happen */
|
|
|
- 0x00, /* 0xxxxxxx */
|
|
|
- 0xC0, /* 110xxxxx */
|
|
|
- 0xE0, /* 1110xxxx */
|
|
|
- 0xF0 /* 11110xxx */
|
|
|
-};
|
|
|
+ /* first bytes of UTF8 encoding for a given length in bytes */
|
|
|
+ static const unsigned char firstByteMark[5] =
|
|
|
+ {
|
|
|
+ 0x00, /* should never happen */
|
|
|
+ 0x00, /* 0xxxxxxx */
|
|
|
+ 0xC0, /* 110xxxxx */
|
|
|
+ 0xE0, /* 1110xxxx */
|
|
|
+ 0xF0 /* 11110xxx */
|
|
|
+ };
|
|
|
|
|
|
-/* Parse the input text into an unescaped cstring, and populate item. */
|
|
|
-static const unsigned char *parse_string(cJSON *item, const unsigned char *str, const unsigned char **ep)
|
|
|
-{
|
|
|
- const unsigned char *ptr = str + 1;
|
|
|
- const unsigned char *end_ptr = str + 1;
|
|
|
- unsigned char *ptr2 = NULL;
|
|
|
- unsigned char *out = NULL;
|
|
|
- size_t len = 0;
|
|
|
- unsigned uc = 0;
|
|
|
- unsigned uc2 = 0;
|
|
|
+ long unsigned int codepoint = 0;
|
|
|
+ unsigned int first_code = 0;
|
|
|
+ const unsigned char *first_sequence = input_pointer;
|
|
|
+ uint8_t utf8_length = 0;
|
|
|
+ uint8_t sequence_length = 0;
|
|
|
|
|
|
- /* not a string! */
|
|
|
- if (*str != '\"')
|
|
|
+ /* get the first utf16 sequence */
|
|
|
+ first_code = parse_hex4(first_sequence + 2);
|
|
|
+ if ((input_end - first_sequence) < 6)
|
|
|
{
|
|
|
- *ep = str;
|
|
|
+ /* input ends unexpectedly */
|
|
|
+ *error_pointer = first_sequence;
|
|
|
goto fail;
|
|
|
}
|
|
|
|
|
|
- while ((*end_ptr != '\"') && *end_ptr)
|
|
|
+ /* check that the code is valid */
|
|
|
+ if (((first_code >= 0xDC00) && (first_code <= 0xDFFF)) || (first_code == 0))
|
|
|
{
|
|
|
- if (*end_ptr++ == '\\')
|
|
|
+ *error_pointer = first_sequence;
|
|
|
+ goto fail;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* UTF16 surrogate pair */
|
|
|
+ if ((first_code >= 0xD800) && (first_code <= 0xDBFF))
|
|
|
+ {
|
|
|
+ const unsigned char *second_sequence = first_sequence + 6;
|
|
|
+ unsigned int second_code = 0;
|
|
|
+ sequence_length = 12; /* \uXXXX\uXXXX */
|
|
|
+
|
|
|
+ if ((input_end - second_sequence) < 6)
|
|
|
{
|
|
|
- if (*end_ptr == '\0')
|
|
|
- {
|
|
|
- /* prevent buffer overflow when last input character is a backslash */
|
|
|
- goto fail;
|
|
|
- }
|
|
|
- /* Skip escaped quotes. */
|
|
|
- end_ptr++;
|
|
|
+ /* input ends unexpectedly */
|
|
|
+ *error_pointer = first_sequence;
|
|
|
+ goto fail;
|
|
|
+ }
|
|
|
+
|
|
|
+ if ((second_sequence[0] != '\\') || (second_sequence[1] != 'u'))
|
|
|
+ {
|
|
|
+ /* missing second half of the surrogate pair */
|
|
|
+ *error_pointer = first_sequence;
|
|
|
+ goto fail;
|
|
|
}
|
|
|
- len++;
|
|
|
+
|
|
|
+ /* get the second utf16 sequence */
|
|
|
+ second_code = parse_hex4(second_sequence + 2);
|
|
|
+ /* check that the code is valid */
|
|
|
+ if ((second_code < 0xDC00) || (second_code > 0xDFFF))
|
|
|
+ {
|
|
|
+ /* invalid second half of the surrogate pair */
|
|
|
+ *error_pointer = first_sequence;
|
|
|
+ goto fail;
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ /* calculate the unicode codepoint from the surrogate pair */
|
|
|
+ codepoint = 0x10000 + (((first_code & 0x3FF) << 10) | (second_code & 0x3FF));
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ sequence_length = 6; /* \uXXXX */
|
|
|
+ codepoint = first_code;
|
|
|
}
|
|
|
|
|
|
- /* This is at most how long we need for the string, roughly. */
|
|
|
- out = (unsigned char*)cJSON_malloc(len + 1);
|
|
|
- if (!out)
|
|
|
+ /* encode as UTF-8
|
|
|
+ * takes at maximum 4 bytes to encode:
|
|
|
+ * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
|
+ if (codepoint < 0x80)
|
|
|
+ {
|
|
|
+ /* normal ascii, encoding 0xxxxxxx */
|
|
|
+ utf8_length = 1;
|
|
|
+ }
|
|
|
+ else if (codepoint < 0x800)
|
|
|
+ {
|
|
|
+ /* two bytes, encoding 110xxxxx 10xxxxxx */
|
|
|
+ utf8_length = 2;
|
|
|
+ }
|
|
|
+ else if (codepoint < 0x10000)
|
|
|
+ {
|
|
|
+ /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */
|
|
|
+ utf8_length = 3;
|
|
|
+ }
|
|
|
+ else if (codepoint <= 0x10FFFF)
|
|
|
{
|
|
|
+ /* four bytes, encoding 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
|
+ utf8_length = 4;
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ /* invalid unicode codepoint */
|
|
|
+ *error_pointer = first_sequence;
|
|
|
goto fail;
|
|
|
}
|
|
|
|
|
|
- ptr2 = out;
|
|
|
+ /* encode as utf8 */
|
|
|
+ switch (utf8_length)
|
|
|
+ {
|
|
|
+ case 4:
|
|
|
+ /* 10xxxxxx */
|
|
|
+ (*output_pointer)[3] = (unsigned char)((codepoint | 0x80) & 0xBF);
|
|
|
+ codepoint >>= 6;
|
|
|
+ case 3:
|
|
|
+ /* 10xxxxxx */
|
|
|
+ (*output_pointer)[2] = (unsigned char)((codepoint | 0x80) & 0xBF);
|
|
|
+ codepoint >>= 6;
|
|
|
+ case 2:
|
|
|
+ (*output_pointer)[1] = (unsigned char)((codepoint | 0x80) & 0xBF);
|
|
|
+ codepoint >>= 6;
|
|
|
+ case 1:
|
|
|
+ /* depending on the length in bytes this determines the
|
|
|
+ encoding of the first UTF8 byte */
|
|
|
+ (*output_pointer)[0] = (unsigned char)((codepoint | firstByteMark[utf8_length]) & 0xFF);
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ *error_pointer = first_sequence;
|
|
|
+ goto fail;
|
|
|
+ }
|
|
|
+ *output_pointer += utf8_length;
|
|
|
+
|
|
|
+ return sequence_length;
|
|
|
+
|
|
|
+fail:
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+/* Parse the input text into an unescaped cstring, and populate item. */
|
|
|
+static const unsigned char *parse_string(cJSON *item, const unsigned char *input, const unsigned char **error_pointer)
|
|
|
+{
|
|
|
+ const unsigned char *input_pointer = input + 1;
|
|
|
+ const unsigned char *input_end = input + 1;
|
|
|
+ unsigned char *output_pointer = NULL;
|
|
|
+ unsigned char *output = NULL;
|
|
|
+
|
|
|
+ /* not a string */
|
|
|
+ if (*input != '\"')
|
|
|
+ {
|
|
|
+ *error_pointer = input;
|
|
|
+ goto fail;
|
|
|
+ }
|
|
|
+
|
|
|
+ {
|
|
|
+ /* calculate approximate size of the output (overestimate) */
|
|
|
+ size_t allocation_length = 0;
|
|
|
+ size_t skipped_bytes = 0;
|
|
|
+ while ((*input_end != '\"') && (*input_end != '\0'))
|
|
|
+ {
|
|
|
+ /* is escape sequence */
|
|
|
+ if (input_end[0] == '\\')
|
|
|
+ {
|
|
|
+ if (input_end[1] == '\0')
|
|
|
+ {
|
|
|
+ /* prevent buffer overflow when last input character is a backslash */
|
|
|
+ goto fail;
|
|
|
+ }
|
|
|
+ skipped_bytes++;
|
|
|
+ input_end++;
|
|
|
+ }
|
|
|
+ input_end++;
|
|
|
+ }
|
|
|
+ if (*input_end == '\0')
|
|
|
+ {
|
|
|
+ goto fail; /* string ended unexpectedly */
|
|
|
+ }
|
|
|
+
|
|
|
+ /* This is at most how much we need for the output */
|
|
|
+ allocation_length = (size_t) (input_end - input) - skipped_bytes;
|
|
|
+ output = (unsigned char*)cJSON_malloc(allocation_length + sizeof('\0'));
|
|
|
+ if (output == NULL)
|
|
|
+ {
|
|
|
+ goto fail; /* allocation failure */
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ output_pointer = output;
|
|
|
/* loop through the string literal */
|
|
|
- while (ptr < end_ptr)
|
|
|
+ while (input_pointer < input_end)
|
|
|
{
|
|
|
- if (*ptr != '\\')
|
|
|
+ if (*input_pointer != '\\')
|
|
|
{
|
|
|
- *ptr2++ = *ptr++;
|
|
|
+ *output_pointer++ = *input_pointer++;
|
|
|
}
|
|
|
/* escape sequence */
|
|
|
else
|
|
|
{
|
|
|
- ptr++;
|
|
|
- switch (*ptr)
|
|
|
+ uint8_t sequence_length = 2;
|
|
|
+ switch (input_pointer[1])
|
|
|
{
|
|
|
case 'b':
|
|
|
- *ptr2++ = '\b';
|
|
|
+ *output_pointer++ = '\b';
|
|
|
break;
|
|
|
case 'f':
|
|
|
- *ptr2++ = '\f';
|
|
|
+ *output_pointer++ = '\f';
|
|
|
break;
|
|
|
case 'n':
|
|
|
- *ptr2++ = '\n';
|
|
|
+ *output_pointer++ = '\n';
|
|
|
break;
|
|
|
case 'r':
|
|
|
- *ptr2++ = '\r';
|
|
|
+ *output_pointer++ = '\r';
|
|
|
break;
|
|
|
case 't':
|
|
|
- *ptr2++ = '\t';
|
|
|
+ *output_pointer++ = '\t';
|
|
|
break;
|
|
|
case '\"':
|
|
|
case '\\':
|
|
|
case '/':
|
|
|
- *ptr2++ = *ptr;
|
|
|
+ *output_pointer++ = input_pointer[1];
|
|
|
break;
|
|
|
+
|
|
|
+ /* UTF-16 literal */
|
|
|
case 'u':
|
|
|
- /* transcode utf16 to utf8. See RFC2781 and RFC3629. */
|
|
|
- uc = parse_hex4(ptr + 1); /* get the unicode char. */
|
|
|
- ptr += 4;
|
|
|
- if (ptr >= end_ptr)
|
|
|
- {
|
|
|
- /* invalid */
|
|
|
- *ep = str;
|
|
|
- goto fail;
|
|
|
- }
|
|
|
- /* check for invalid. */
|
|
|
- if (((uc >= 0xDC00) && (uc <= 0xDFFF)) || (uc == 0))
|
|
|
+ sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer, error_pointer);
|
|
|
+ if (sequence_length == 0)
|
|
|
{
|
|
|
- *ep = str;
|
|
|
+ /* failed to convert UTF16-literal to UTF-8 */
|
|
|
goto fail;
|
|
|
}
|
|
|
-
|
|
|
- /* UTF16 surrogate pairs. */
|
|
|
- if ((uc >= 0xD800) && (uc<=0xDBFF))
|
|
|
- {
|
|
|
- if ((ptr + 6) > end_ptr)
|
|
|
- {
|
|
|
- /* invalid */
|
|
|
- *ep = str;
|
|
|
- goto fail;
|
|
|
- }
|
|
|
- if ((ptr[1] != '\\') || (ptr[2] != 'u'))
|
|
|
- {
|
|
|
- /* missing second-half of surrogate. */
|
|
|
- *ep = str;
|
|
|
- goto fail;
|
|
|
- }
|
|
|
- uc2 = parse_hex4(ptr + 3);
|
|
|
- ptr += 6; /* \uXXXX */
|
|
|
- if ((uc2 < 0xDC00) || (uc2 > 0xDFFF))
|
|
|
- {
|
|
|
- /* invalid second-half of surrogate. */
|
|
|
- *ep = str;
|
|
|
- goto fail;
|
|
|
- }
|
|
|
- /* calculate unicode codepoint from the surrogate pair */
|
|
|
- uc = 0x10000 + (((uc & 0x3FF) << 10) | (uc2 & 0x3FF));
|
|
|
- }
|
|
|
-
|
|
|
- /* encode as UTF8
|
|
|
- * takes at maximum 4 bytes to encode:
|
|
|
- * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
|
- len = 4;
|
|
|
- if (uc < 0x80)
|
|
|
- {
|
|
|
- /* normal ascii, encoding 0xxxxxxx */
|
|
|
- len = 1;
|
|
|
- }
|
|
|
- else if (uc < 0x800)
|
|
|
- {
|
|
|
- /* two bytes, encoding 110xxxxx 10xxxxxx */
|
|
|
- len = 2;
|
|
|
- }
|
|
|
- else if (uc < 0x10000)
|
|
|
- {
|
|
|
- /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */
|
|
|
- len = 3;
|
|
|
- }
|
|
|
- ptr2 += len;
|
|
|
-
|
|
|
- switch (len) {
|
|
|
- case 4:
|
|
|
- /* 10xxxxxx */
|
|
|
- *--ptr2 = (unsigned char)((uc | 0x80) & 0xBF);
|
|
|
- uc >>= 6;
|
|
|
- case 3:
|
|
|
- /* 10xxxxxx */
|
|
|
- *--ptr2 = (unsigned char)((uc | 0x80) & 0xBF);
|
|
|
- uc >>= 6;
|
|
|
- case 2:
|
|
|
- /* 10xxxxxx */
|
|
|
- *--ptr2 = (unsigned char)((uc | 0x80) & 0xBF);
|
|
|
- uc >>= 6;
|
|
|
- case 1:
|
|
|
- /* depending on the length in bytes this determines the
|
|
|
- * encoding ofthe first UTF8 byte */
|
|
|
- *--ptr2 = (unsigned char)((uc | firstByteMark[len]) & 0xFF);
|
|
|
- break;
|
|
|
- default:
|
|
|
- *ep = str;
|
|
|
- goto fail;
|
|
|
- }
|
|
|
- ptr2 += len;
|
|
|
break;
|
|
|
+
|
|
|
default:
|
|
|
- *ep = str;
|
|
|
+ *error_pointer = input_pointer;
|
|
|
goto fail;
|
|
|
}
|
|
|
- ptr++;
|
|
|
+ input_pointer += sequence_length;
|
|
|
}
|
|
|
}
|
|
|
- *ptr2 = '\0';
|
|
|
- if (*ptr == '\"')
|
|
|
- {
|
|
|
- ptr++;
|
|
|
- }
|
|
|
+
|
|
|
+ /* zero terminate the output */
|
|
|
+ *output_pointer = '\0';
|
|
|
|
|
|
item->type = cJSON_String;
|
|
|
- item->valuestring = (char*)out;
|
|
|
+ item->valuestring = (char*)output;
|
|
|
|
|
|
- return ptr;
|
|
|
+ return input_end + 1;
|
|
|
|
|
|
fail:
|
|
|
- if (out != NULL)
|
|
|
+ if (output != NULL)
|
|
|
{
|
|
|
- cJSON_free(out);
|
|
|
+ cJSON_free(output);
|
|
|
}
|
|
|
|
|
|
return NULL;
|