duk_codepage_conv.c 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  /*
   *  Convert an 8-bit input string (e.g. ISO-8859-1) into CESU-8.
   *  Calling code supplies the "code page" as a 256-entry array of
   *  codepoints for the conversion.
   *
   *  This is useful when input data is in a non-UTF-8 format and must
   *  be converted at runtime, e.g. when compiling non-UTF-8 source
   *  code.  An alternative is to use a conversion library such as iconv.
   */
  10. #include "duktape.h"
  11. /* Decode an 8-bit string using 'codepage' into Unicode codepoints and
  12. * re-encode into CESU-8. Codepage argument must point to a 256-entry
  13. * table. Only supports BMP (codepoints U+0000 to U+FFFF).
  14. */
  15. void duk_decode_string_codepage(duk_context *ctx, const char *str, size_t len, unsigned int *codepage) {
  16. unsigned char *tmp;
  17. size_t tmplen, i;
  18. unsigned char *p;
  19. unsigned int cp;
  20. tmplen = 3 * len; /* max expansion is 1 input byte -> 3 output bytes */
  21. if (tmplen / 3 != len) {
  22. /* Temporary buffer length wraps. */
  23. duk_error(ctx, DUK_ERR_RANGE_ERROR, "input string too long");
  24. return;
  25. }
  26. tmp = (unsigned char *) duk_push_fixed_buffer(ctx, tmplen);
  27. for (i = 0, p = tmp; i < len; i++) {
  28. cp = codepage[((unsigned char *) str)[i]] & 0xffffUL;
  29. if (cp < 0x80UL) {
  30. *p++ = (unsigned char) cp;
  31. } else if (cp < 0x800UL) {
  32. *p++ = (unsigned char) (0xc0 + ((cp >> 6) & 0x1f));
  33. *p++ = (unsigned char) (0x80 + (cp & 0x3f));
  34. } else {
  35. /* In CESU-8 all codepoints in [0x0000,0xFFFF] are
  36. * allowed, including surrogates.
  37. */
  38. *p++ = (unsigned char) (0xe0 + ((cp >> 12) & 0x0f));
  39. *p++ = (unsigned char) (0x80 + ((cp >> 6) & 0x3f));
  40. *p++ = (unsigned char) (0x80 + (cp & 0x3f));
  41. }
  42. }
  43. duk_push_lstring(ctx, (const char *) tmp, (duk_size_t) (p - tmp));
  44. /* [ ... tmp res ] */
  45. duk_remove(ctx, -2);
  46. }