/* Largest value the encoder/decoder will handle: utf8_encode asserts
** x <= UTF8_MAX and utf8_decode rejects results above it. */
42#define UTF8_MAX 0x7FFFFFFFu
/* Largest Unicode code point; utf8_invalid rejects anything above it. */
43#define UTF8_MAXCP 0x10FFFFu
/* True when the byte at p has the bit pattern 10xxxxxx, i.e. it is a
** UTF-8 continuation byte. Note that p is dereferenced. */
44#define iscont(p) ((*(p) & 0xC0) == 0x80)
/* Explicit cast helper: converts expr to type tp. */
45#define CAST(tp,expr) ((tp)(expr))
/* Wraps a string literal in single quotes (used when building error
** messages by literal concatenation). */
48# define LUA_QL(x) "'" x "'"
51static int utf8_invalid (utfint ch)
52{
return (ch > UTF8_MAXCP || (0xD800u <= ch && ch <= 0xDFFFu)); }
/* Encode x as UTF-8 into buff.
** The visible stores index from the end of the buffer
** (buff[UTF8_BUFFSZ - 1] downwards), so the encoded bytes end up
** right-aligned in buff; add_utf8char reads them back starting at
** buff + UTF8_BUFFSZ - n, where n is the value returned here.
** buff must therefore have room for UTF8_BUFFSZ bytes. */
54static size_t utf8_encode (
char *buff, utfint x) {
/* precondition: x must not exceed UTF8_MAX */
56 lua_assert(x <= UTF8_MAX);
/* store the low seven bits of x in the last slot of the buffer */
58 buff[UTF8_BUFFSZ - 1] = x & 0x7F;
/* emit one continuation byte (10xxxxxx) carrying the low six bits of x,
** then advance the byte counter n */
62 buff[UTF8_BUFFSZ - (n++)] = 0x80 | (x & 0x3f);
/* lead byte: marker bits derived from mfb combined with the remaining
** bits of x (presumably mfb is the mask of payload bits that still fit
** in the first byte -- its definition is not visible here) */
66 buff[UTF8_BUFFSZ - n] = ((~mfb << 1) | x) & 0xFF;
/* Decode one UTF-8 sequence starting at s and store the value through val.
** Callers in this file treat a NULL return as a decoding failure
** (utf8_safe_decode, Lutf8_len) and otherwise use the returned pointer as
** the position following the decoded sequence.
** `strict` is passed as 1 by string_to_nfc and as !lax by Lutf8_len; the
** line that tests it is not visible here, but the final range/surrogate
** check below looks like the strict-mode test -- confirm. */
71static const char *utf8_decode (
const char *s, utfint *val,
int strict) {
/* limits[count] is compared against the decoded value below: a result
** smaller than limits[count] is rejected, which rules out encodings that
** use more continuation bytes than the value needs */
72 static const utfint limits[] =
73 {~0u, 0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u};
/* first byte, read as unsigned so the bit tests below are well defined */
74 unsigned int c = (
unsigned char)s[0];
/* each pass shifts c left by one; the loop keeps consuming continuation
** bytes for as long as bit 0x40 of c is set */
80 for (;
c & 0x40;
c <<= 1) {
81 unsigned int cc = (
unsigned char)s[++count];
/* every following byte must have the form 10xxxxxx */
82 if ((cc & 0xC0) != 0x80)
/* append the six payload bits of the continuation byte */
84 res = (res << 6) | (cc & 0x3F);
/* merge in the payload bits left over from the first byte */
86 res |= ((utfint)(
c & 0x7F) << (count * 5));
/* reject too many continuation bytes, values above UTF8_MAX, and values
** below the minimum for this sequence length */
87 if (count > 5 || res > UTF8_MAX || res < limits[count])
/* same test as utf8_invalid: above UTF8_MAXCP or a surrogate */
93 if (res > UTF8_MAXCP || (0xD800u <= res && res <= 0xDFFFu))
/* Find the start of the character that ends just before `e`.
** Walks backwards over continuation bytes (10xxxxxx) without ever moving
** before `s`, then steps onto the byte preceding them.
** Returns `s` when nothing precedes `e` inside [s, e) or when every byte
** back to `s` is a continuation byte. */
static const char *utf8_prev (const char *s, const char *e) {
  const char *cur = e;
  while (cur > s && ((unsigned char)cur[-1] & 0xC0) == 0x80)
    cur--;  /* skip a trailing continuation byte */
  if (cur > s)
    return cur - 1;
  return s;
}
/* Return the position of the character following the one at `s`,
** clamped to `e`.
** Skips the continuation bytes (10xxxxxx) that follow `s` and returns
** the first byte after them; returns `e` when `s` is already at or past
** `e`, or when the character at `s` runs up to the end of the range.
**
** Only bytes strictly inside [s, e) are inspected. Testing the byte at
** `e` itself would read outside the range, and the result is the same
** either way: once s reaches e - 1 the answer is `e`. */
static const char *utf8_next (const char *s, const char *e) {
  while (e - s > 1 && ((unsigned char)s[1] & 0xC0) == 0x80)
    ++s;
  return s < e ? s + 1 : e;
}
110static size_t utf8_length (
const char *s,
const char *e) {
112 for (i = 0; s < e; ++i)
/* Move idx characters away from byte position `offset` (1-based) inside
** the range [s, e).
** A positive idx is consumed by walking forward with utf8_next, a
** negative idx by walking backward with utf8_prev (the branch lines that
** select between the two loops are not visible here).
** Returns the resulting pointer, or NULL when the range was exhausted
** before idx reached zero. */
117static const char *utf8_offset (
const char *s,
const char *e, lua_Integer offset, lua_Integer idx) {
/* convert the 1-based byte offset into a pointer */
118 const char *p = s + offset - 1;
/* forward: one character per step until idx is used up or e is reached */
120 while (p < e && idx > 0)
121 p = utf8_next(p, e), --idx;
122 return idx == 0 ? p : NULL;
/* backward: one character per step until idx is used up or s is reached */
124 while (s < p && idx < 0)
125 p = utf8_prev(s, p), ++idx;
126 return idx == 0 ? p : NULL;
/* Resolve a character index to a pointer inside [s, e) using utf8_offset.
** The first alternative counts idx - 1 characters forward from the first
** byte; the second counts idx characters starting one past the last byte
** (offset e-s+1). The condition choosing between them is on a line not
** visible here -- presumably the sign of idx.
** The result is whatever utf8_offset returns, so it may be NULL. */
130static const char *utf8_relat (
const char *s,
const char *e,
int idx) {
132 utf8_offset(s, e, 1, idx - 1) :
133 utf8_offset(s, e, e-s+1, idx);
/* Convert the character indices *i and *j into byte offsets from s,
** in place.
** On exit *i is the byte offset of the start of character i and *j is the
** byte offset just past character j (note the utf8_next on pe).
** An index that utf8_relat cannot resolve is clamped: to e when the index
** is positive, to s otherwise.
** The return statement is not visible here; callers such as Lutf8_sub and
** Lutf8_byte test the result as a boolean before using the range. */
136static int utf8_range(
const char *s,
const char *e, lua_Integer *i, lua_Integer *j) {
/* NOTE(review): both indices are narrowed to int by CAST before lookup */
137 const char *ps = utf8_relat(s, e, CAST(
int, *i));
138 const char *pe = utf8_relat(s, e, CAST(
int, *j));
139 *i = (ps ? ps : (*i > 0 ? e : s)) - s;
140 *j = (pe ? utf8_next(pe, e) : (*j > 0 ? e : s)) - s;
/* Sequence length in bytes, looked up by utf8_invalid_offset with the
** index c >> 4 (the high nibble of a byte).
** Slots 0x8..0xB hold 0xFF, which is the value -1 takes when stored in a
** uint8_t. */
static uint8_t utf8_code_unit_len[] = {
  /* 0x0 .. 0x7 */ 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x8 .. 0xB */ 0xFF, 0xFF, 0xFF, 0xFF,
  /* 0xC .. 0xD */ 2, 2,
  /* 0xE        */ 3,
  /* 0xF        */ 4
};
150static const char *utf8_invalid_offset(
const char *s,
const char *e) {
160 uint8_t needed_bytes = utf8_code_unit_len[
c >> 4];
161 if (e - s < needed_bytes)
164 if ((c2 & 0xC0) != 0x80)
166 if (needed_bytes >= 3) {
168 if ((c3 & 0xC0) != 0x80)
170 if (needed_bytes == 3) {
171 if (
c == 0xE0 && c2 < 0xA0)
173 if (
c == 0xED && c2 >= 0xA0)
177 if ((c4 & 0xC0) != 0x80)
179 if (
c == 0xF0 && c2 < 0x90)
181 if (
c == 0xF4 && c2 >= 0x90)
195#define table_size(t) (sizeof(t)/sizeof((t)[0]))
197#define utf8_categories(X) \
208#define utf8_converters(X) \
/* Binary search for ch in a table of `size` range entries.
** Each entry describes the values first, first+step, ... up to last.
** When an entry whose [first, last] interval contains ch is found, the
** result is nonzero only if ch lies exactly on that entry's step stride.
** (The lines that narrow begin/end and the not-found return are not
** visible here.) */
214static int find_in_range (
range_table *t,
size_t size, utfint ch) {
220 while (begin < end) {
221 size_t mid = (begin + end) / 2;
/* the whole entry lies below ch */
222 if (t[mid].last < ch)
/* the whole entry lies above ch */
224 else if (t[mid].first > ch)
/* ch is inside [first, last]: it is a member only if it sits on the stride */
227 return (ch - t[mid].first) % t[mid].step == 0;
/* Binary search for ch in a table of `size` conversion entries and, when
** ch belongs to an entry (inside [first, last] and on its step stride),
** return ch shifted by that entry's offset.
** The value returned when no entry matches is on a line not visible here.
** NOTE(review): the return type is int while the define_converter wrappers
** built on this function return utfint. */
233static int convert_char (
conv_table *t,
size_t size, utfint ch) {
239 while (begin < end) {
240 size_t mid = (begin + end) / 2;
/* the whole entry lies below ch */
241 if (t[mid].last < ch)
/* the whole entry lies above ch */
243 else if (t[mid].first > ch)
/* ch is in range and on the stride: apply the entry's offset */
245 else if ((ch - t[mid].first) % t[mid].step == 0)
246 return ch + t[mid].offset;
256static int lookup_canon_cls (utfint ch) {
261 size_t begin = 0, end = table_size(nfc_combining_table);
263 while (begin < end) {
264 size_t mid = (begin + end) / 2;
265 if (nfc_combining_table[mid].last < ch)
267 else if (nfc_combining_table[mid].first > ch)
270 return nfc_combining_table[mid].canon_cls;
276static nfc_table *nfc_quickcheck (utfint ch) {
282 size_t begin = 0, end = table_size(nfc_quickcheck_table);
284 while (begin < end) {
285 size_t mid = (begin + end) / 2;
286 utfint found = nfc_quickcheck_table[mid].cp;
292 return &nfc_quickcheck_table[mid];
/* Look up the pair (cp1, cp2) in nfc_composite_table and, when found,
** store the table's `dest` value through dest.
** The table is binary-searched on a hash of the pair; because different
** pairs can share a hash, a candidate is accepted only when both cp1 and
** cp2 match the entry.
** NOTE(review): nfc_check calls this with dest == NULL; whether the store
** below is guarded against that is decided on a line not visible here. */
298static int nfc_combine (utfint cp1, utfint cp2, utfint *dest) {
299 size_t begin = 0, end = table_size(nfc_composite_table);
/* search key: must match how the table's hash field was generated */
300 unsigned int hash = (cp1 * 213) + cp2;
302 while (begin < end) {
303 size_t mid = (begin + end) / 2;
304 utfint val = nfc_composite_table[mid].hash;
307 }
else if (val > hash) {
309 }
else if (nfc_composite_table[mid].cp1 == cp1 && nfc_composite_table[mid].cp2 == cp2) {
311 *dest = nfc_composite_table[mid].dest;
322 size_t begin = 0, end = table_size(nfc_decompose_table);
324 while (begin < end) {
325 size_t mid = (begin + end) / 2;
326 utfint found = nfc_decompose_table[mid].cp;
332 return &nfc_decompose_table[mid];
338static int nfc_check (utfint ch,
nfc_table *entry, utfint starter,
unsigned int canon_cls,
unsigned int prev_canon_cls) {
339 int reason = entry->reason;
341 if (reason == REASON_MUST_CONVERT_1 || reason == REASON_MUST_CONVERT_2) {
344 }
else if (reason == REASON_STARTER_CAN_COMBINE) {
347 if (!prev_canon_cls && nfc_combine(starter, ch, NULL)) {
351 }
else if (reason == REASON_COMBINING_MARK) {
353 if (canon_cls <= prev_canon_cls) {
356 if (nfc_combine(starter, ch, NULL)) {
364 if (decomp->canon_cls2 > canon_cls && nfc_combine(decomp->to1, ch, NULL)) {
368 if (decomp2 && decomp2->canon_cls2 > canon_cls && nfc_combine(decomp2->to1, ch, NULL)) {
373 }
else if (reason == REASON_JAMO_VOWEL) {
374 if (!prev_canon_cls && starter >= 0x1100 && starter <= 0x1112) {
378 }
else if (reason == REASON_JAMO_TRAILING) {
379 if (!prev_canon_cls && starter >= 0xAC00 && starter <= 0xD7A3) {
381 if ((starter - 0xAC00) % 28 == 0) {
391static void merge_combining_marks (uint32_t *src1, uint32_t *src2, uint32_t *dest,
size_t size1,
size_t size2) {
392 while (size1 && size2) {
393 if ((*src1 & 0xFF) > (*src2 & 0xFF)) {
411static void stable_sort_combining_marks (uint32_t *vector, uint32_t *scratch,
size_t size) {
414 size_t limit = size - 1;
415 for (
unsigned int i = 0; i < limit; i += 2) {
416 if ((vector[i] & 0xFF) > (vector[i+1] & 0xFF)) {
417 uint32_t temp = vector[i];
418 vector[i] = vector[i+1];
425 uint32_t *src = vector, *dest = scratch;
426 unsigned int runsize = 2;
427 while (runsize < size) {
428 unsigned int blocksize = runsize * 2;
429 limit = size & ~(blocksize - 1);
430 for (
unsigned int i = 0; i < limit; i += blocksize)
431 merge_combining_marks(&src[i], &src[i+runsize], &dest[i], runsize, runsize);
432 if (size - limit > runsize) {
433 merge_combining_marks(&src[limit], &src[limit+runsize], &dest[limit], runsize, size - limit - runsize);
435 memcpy(&dest[limit], &src[limit], (size - limit) *
sizeof(uint32_t));
439 uint32_t *temp = src; src = dest; dest = temp;
443 if (dest == vector) {
447 memcpy(vector, scratch, size *
sizeof(uint32_t));
452static void stable_insert_combining_mark (uint32_t *vector,
size_t vec_size,
unsigned int i)
454 unsigned int item = vector[i];
455 unsigned int canon_cls = item & 0xFF;
457 if (canon_cls < (vector[i-1] & 0xFF)) {
459 vector[i] = vector[i-1];
461 }
while (i > 0 && canon_cls < (vector[i-1] & 0xFF));
466 if (i < vec_size-1) {
467 if (canon_cls > (vector[i+1] & 0xFF)) {
469 vector[i] = vector[i+1];
471 }
while (i < vec_size-1 && canon_cls > (vector[i+1] & 0xFF));
478static void add_utf8char (luaL_Buffer *b, utfint ch);
/* Make sure *vector can hold `needed` elements, growing it when
** needed >= *size.
** Growth allocates a new array of twice the current size with malloc and
** copies the current contents into it. `onstack` is the caller's initial
** buffer: *vector is compared against it before the old storage is
** dealt with (that statement is on a line not visible here), and *vector
** is then repointed at the new array.
** NOTE(review): the malloc result is handed to memcpy in the very next
** statement with no NULL check.
** NOTE(review): the size is doubled once only, so this relies on
** needed < 2 * current size. */
480static inline void grow_vector_if_needed (uint32_t **vector, uint32_t *onstack,
size_t *size,
size_t needed)
482 size_t current_size = *size;
483 if (needed >= current_size) {
484 size_t new_size = current_size * 2;
485 uint32_t *new_vector = malloc(new_size *
sizeof(uint32_t));
486 memcpy(new_vector, *vector, current_size *
sizeof(uint32_t));
/* only heap storage may be released; the caller's stack buffer is not */
488 if (*vector != onstack)
490 *vector = new_vector;
494static void string_to_nfc (lua_State *L, luaL_Buffer *buff,
const char *s,
const char *e)
520 utfint starter = -1, ch;
521 const char *to_copy = s;
522 unsigned int prev_canon_cls = 0, canon_cls = 0;
529 size_t vec_size = 0, vec_max =
sizeof(onstack)/
sizeof(uint32_t);
530 uint32_t *vector = onstack;
533 const char *new_s = utf8_decode(s, &ch, 1);
535 if (vector != onstack)
537 lua_pushstring(L,
"string is not valid UTF-8");
540 unsigned int canon_cls = lookup_canon_cls(ch);
549 if (entry && entry->reason == REASON_MUST_CONVERT_2) {
550 utfint conv1 = entry->data1;
551 unsigned int canon_cls1 = lookup_canon_cls(conv1);
553 utfint conv2 = entry->data2;
554 unsigned int canon_cls2 = lookup_canon_cls(conv2);
555 grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 2);
556 vector[vec_size++] = (conv1 << 8) | (canon_cls1 & 0xFF);
557 vector[vec_size++] = (conv2 << 8) | (canon_cls2 & 0xFF);
559 prev_canon_cls = canon_cls2;
566 if (prev_canon_cls) {
569process_combining_marks:
572 for (
unsigned int i = 1; i < vec_size; i++) {
573 if ((vector[i-1] & 0xFF) > (vector[i] & 0xFF)) {
575 uint32_t *scratch = malloc(vec_size *
sizeof(uint32_t));
576 stable_sort_combining_marks(vector, scratch, vec_size);
585 while (i < vec_size) {
586 utfint combine_mark = vector[i] >> 8;
587 nfc_table *mark_entry = nfc_quickcheck(combine_mark);
589 if (mark_entry->reason == REASON_MUST_CONVERT_1) {
591 vector[i] = (mark_entry->data1 << 8) | mark_entry->data2;
594 }
else if (mark_entry->reason == REASON_MUST_CONVERT_2) {
596 grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1);
597 memmove(&vector[i+2], &vector[i+1],
sizeof(uint32_t) * (vec_size - i - 1));
598 vector[i] = (mark_entry->data1 << 8) | lookup_canon_cls(mark_entry->data1);
599 vector[i+1] = (mark_entry->data2 << 8) | lookup_canon_cls(mark_entry->data2);
603 }
else if (mark_entry->reason == REASON_COMBINING_MARK) {
604 unsigned int mark_canon_cls = vector[i] & 0xFF;
605 if (i == 0 || mark_canon_cls > (vector[i-1] & 0xFF)) {
606 if (nfc_combine(starter, combine_mark, &starter)) {
609 memmove(&vector[i], &vector[i+1],
sizeof(uint32_t) * (vec_size - i));
616 if (decomp->canon_cls2 > mark_canon_cls && nfc_combine(decomp->to1, combine_mark, &starter)) {
622 unsigned int class2 = lookup_canon_cls(decomp->to2);
623 memmove(&vector[1], &vector[0],
sizeof(uint32_t) * i);
624 vector[0] = (decomp->to2 << 8) | class2;
625 stable_insert_combining_mark(vector, vec_size, 0);
630 if (decomp2 && decomp2->canon_cls2 > mark_canon_cls && nfc_combine(decomp2->to1, combine_mark, &starter)) {
631 grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1);
632 memmove(&vector[i+2], &vector[i+1],
sizeof(uint32_t) * (vec_size - i - 1));
633 memmove(&vector[2], &vector[0],
sizeof(uint32_t) * i);
634 vector[0] = (decomp2->to2 << 8) | lookup_canon_cls(decomp2->to2);
635 vector[1] = (decomp->to2 << 8) | lookup_canon_cls(decomp->to2);
637 stable_insert_combining_mark(vector, vec_size, 1);
638 stable_insert_combining_mark(vector, vec_size, 0);
654 add_utf8char(buff, starter);
655 for (
unsigned int i = 0; i < vec_size; i++)
656 add_utf8char(buff, vector[i] >> 8);
659 luaL_addlstring(buff, to_copy, s - to_copy);
664 if (vector != onstack)
670 }
else if (starter != -1) {
675 if (entry->reason == REASON_STARTER_CAN_COMBINE && nfc_combine(starter, ch, &ch)) {
677 }
else if (entry->reason == REASON_JAMO_VOWEL && starter >= 0x1100 && starter <= 0x1112) {
678 ch = 0xAC00 + ((starter - 0x1100) * 588) + ((ch - 0x1161) * 28);
680 }
else if (entry->reason == REASON_JAMO_TRAILING) {
681 if (starter >= 0xAC00 && starter <= 0xD7A3 && (starter - 0xAC00) % 28 == 0) {
682 ch = starter + ch - 0x11A7;
688 add_utf8char(buff, starter);
696 if (entry->reason == REASON_MUST_CONVERT_1) {
697 starter = entry->data1;
699 }
else if (entry->reason == REASON_MUST_CONVERT_2) {
700 utfint conv1 = entry->data1;
701 utfint conv2 = entry->data2;
704 unsigned int canon_cls2 = lookup_canon_cls(conv2);
708 nfc_table *conv_entry = nfc_quickcheck(conv1);
709 if (conv_entry && conv_entry->reason == REASON_MUST_CONVERT_2) {
710 utfint conv3 = conv2;
711 unsigned int canon_cls3 = canon_cls2;
712 conv1 = conv_entry->data1;
713 conv2 = conv_entry->data2;
714 canon_cls2 = lookup_canon_cls(conv2);
717 vector[0] = (conv2 << 8) | canon_cls2;
718 vector[1] = (conv3 << 8) | canon_cls3;
721 add_utf8char(buff, conv1);
723 vector[0] = (conv3 << 8) | canon_cls3;
726 canon_cls = canon_cls3;
729 vector[0] = (conv2 << 8) | canon_cls2;
731 canon_cls = canon_cls2;
734 add_utf8char(buff, conv1);
742 grow_vector_if_needed(&vector, onstack, &vec_max, vec_size + 1);
743 vector[vec_size++] = (ch << 8) | (canon_cls & 0xFF);
747 prev_canon_cls = canon_cls;
751 goto process_combining_marks;
753 add_utf8char(buff, starter);
755 if (vector != onstack)
761static int hangul_type (utfint ch) {
766 size_t begin = 0, end = table_size(hangul_table);
768 while (begin < end) {
769 size_t mid = (begin + end) / 2;
770 if (hangul_table[mid].last < ch)
772 else if (hangul_table[mid].first > ch)
775 return hangul_table[mid].type;
781static int indic_conjunct_type (utfint ch) {
786 size_t begin = 0, end = table_size(indic_table);
788 while (begin < end) {
789 size_t mid = (begin + end) / 2;
790 if (indic_table[mid].last < ch)
792 else if (indic_table[mid].first > ch)
795 return indic_table[mid].type;
/* Generate the per-category predicates and per-case converters.
** define_category(cls, name) expands to
**   static int utf8_is<name>(utfint ch)
** which looks ch up in <name>_table with find_in_range.
** define_converter(name) expands to
**   static utfint utf8_to<name>(utfint ch)
** which maps ch through to<name>_table with convert_char.
** The X-macro lists utf8_categories and utf8_converters instantiate one
** function per entry; the two helper macros are undefined afterwards. */
801#define define_category(cls, name) static int utf8_is##name (utfint ch)\
802{ return find_in_range(name##_table, table_size(name##_table), ch); }
803#define define_converter(name) static utfint utf8_to##name (utfint ch) \
804{ return convert_char(to##name##_table, table_size(to##name##_table), ch); }
805utf8_categories(define_category)
806utf8_converters(define_converter)
807#undef define_category
808#undef define_converter
810static int utf8_isgraph (utfint ch) {
811 if (find_in_range(space_table, table_size(space_table), ch))
813 if (find_in_range(graph_table, table_size(graph_table), ch))
815 if (find_in_range(compose_table, table_size(compose_table), ch))
820static int utf8_isalnum (utfint ch) {
821 if (find_in_range(alpha_table, table_size(alpha_table), ch))
823 if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch))
/* Display width of ch, decided by checking a series of range tables in
** order: doublewidth_table, ambiwidth_table, compose_table and
** unprintable_table.
** For a character in ambiwidth_table the width is 1 when ambi_is_single
** is nonzero and 2 otherwise. The values returned for the other tables
** and for the fall-through case are on lines not visible here. */
828static int utf8_width (utfint ch,
int ambi_is_single) {
829 if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch))
831 if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch))
832 return ambi_is_single ? 1 : 2;
833 if (find_in_range(compose_table, table_size(compose_table), ch))
835 if (find_in_range(unprintable_table, table_size(unprintable_table), ch))
842static int typeerror (lua_State *L,
int idx,
const char *tname)
843{
return luaL_error(L,
"%s expected, got %s", tname, luaL_typename(L, idx)); }
845static const char *check_utf8 (lua_State *L,
int idx,
const char **end) {
847 const char *s = luaL_checklstring(L, idx, &len);
848 if (end) *end = s+len;
852static const char *to_utf8 (lua_State *L,
int idx,
const char **end) {
854 const char *s = lua_tolstring(L, idx, &len);
855 if (end) *end = s+len;
859static const char *utf8_safe_decode (lua_State *L,
const char *p, utfint *pval) {
860 p = utf8_decode(p, pval, 0);
861 if (p == NULL) luaL_error(L,
"invalid UTF-8 code");
865static void add_utf8char (luaL_Buffer *b, utfint ch) {
866 char buff[UTF8_BUFFSZ];
867 size_t n = utf8_encode(buff, ch);
868 luaL_addlstring(b, buff+UTF8_BUFFSZ-n, n);
871static lua_Integer byte_relat (lua_Integer pos,
size_t len) {
872 if (pos >= 0)
return pos;
873 else if (0u - (
size_t)pos > len)
return 0;
874 else return (lua_Integer)len + pos + 1;
/* Lua binding: count the characters of a string between two byte
** positions.
** Stack arguments: 1 = string, 2 = start byte (default 1), 3 = end byte
** (default -1), 4 = lax flag. Negative positions are resolved with
** byte_relat.
** When a sequence fails to decode or decodes to an invalid code point,
** the 1-based byte position of that sequence is pushed; otherwise the
** character count n is pushed. (What else is pushed and the return
** counts are on lines not visible here.) */
877static int Lutf8_len (lua_State *L) {
879 const char *s = luaL_checklstring(L, 1, &len), *p, *e;
880 lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len);
881 lua_Integer pose = byte_relat(luaL_optinteger(L, 3, -1), len);
882 int lax = lua_toboolean(L, 4);
/* the --posi inside the check also converts posi to a 0-based offset */
883 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
884 "initial position out of string");
/* likewise --pose makes pose a 0-based offset of the last byte */
885 luaL_argcheck(L, --pose < (lua_Integer)len, 3,
886 "final position out of string");
/* p walks the bytes [posi, pose]; n counts loop iterations */
887 for (n = 0, p=s+posi, e=s+pose+1; p < e; ++n) {
/* strict decoding unless the caller asked for lax mode */
892 const char *np = utf8_decode(p, &ch, !lax);
893 if (np == NULL || utf8_invalid(ch)) {
/* report where the bad sequence starts, 1-based */
895 lua_pushinteger(L, p - s + 1);
901 lua_pushinteger(L, n);
/* Lua binding: substring by character indices.
** Stack arguments: 1 = string, 2 = first character index (required),
** 3 = last character index (default -1).
** utf8_range rewrites posi/pose into byte offsets; when it reports a
** usable range the bytes [posi, pose) are pushed, and the empty string is
** pushed on the alternative path (the `else` and the return are on lines
** not visible here). */
905static int Lutf8_sub (lua_State *L) {
906 const char *e, *s = check_utf8(L, 1, &e);
907 lua_Integer posi = luaL_checkinteger(L, 2);
908 lua_Integer pose = luaL_optinteger(L, 3, -1);
909 if (utf8_range(s, e, &posi, &pose))
910 lua_pushlstring(L, s+posi, pose-posi);
912 lua_pushliteral(L,
"");
916static int Lutf8_reverse (lua_State *L) {
918 const char *prev, *pprev, *ends, *e, *s = check_utf8(L, 1, &e);
920 int lax = lua_toboolean(L, 2);
921 luaL_buffinit(L, &b);
923 for (prev = e; s < prev; e = prev) {
924 prev = utf8_prev(s, prev);
925 luaL_addlstring(&b, prev, e-prev);
928 for (prev = e; s < prev; prev = pprev) {
930 ends = utf8_safe_decode(L, pprev = utf8_prev(s, prev), &code);
931 assert(ends == prev);
932 if (utf8_invalid(code))
933 return luaL_error(L,
"invalid UTF-8 code");
934 if (!utf8_iscompose(code)) {
935 luaL_addlstring(&b, pprev, e-pprev);
944static int Lutf8_byte (lua_State *L) {
946 const char *e, *s = check_utf8(L, 1, &e);
947 lua_Integer posi = luaL_optinteger(L, 2, 1);
948 lua_Integer pose = luaL_optinteger(L, 3, posi);
949 if (utf8_range(s, e, &posi, &pose)) {
950 for (e = s + pose, s = s + posi; s < e; ++n) {
952 s = utf8_safe_decode(L, s, &ch);
953 lua_pushinteger(L, ch);
959static int Lutf8_codepoint (lua_State *L) {
960 const char *e, *s = check_utf8(L, 1, &e);
962 lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len);
963 lua_Integer pose = byte_relat(luaL_optinteger(L, 3, posi), len);
964 int lax = lua_toboolean(L, 4);
967 luaL_argcheck(L, posi >= 1, 2,
"out of range");
968 luaL_argcheck(L, pose <= (lua_Integer)len, 3,
"out of range");
969 if (posi > pose)
return 0;
970 if (pose - posi >= INT_MAX)
971 return luaL_error(L,
"string slice too long");
972 n = (int)(pose - posi + 1);
973 luaL_checkstack(L, n,
"string slice too long");
976 for (n = 0, s += posi - 1; s < se;) {
978 s = utf8_safe_decode(L, s, &code);
979 if (!lax && utf8_invalid(code))
980 return luaL_error(L,
"invalid UTF-8 code");
981 lua_pushinteger(L, code);
/* Lua binding: build a string from code points.
** Every stack argument (1..n) must be an integer; each one is checked
** against UTF8_MAXCP and appended to the buffer as UTF-8.
** NOTE(review): only the upper bound is checked. If lua_Integer is a
** signed type wider than unsigned int (the usual 64-bit configuration),
** a negative argument satisfies `code <= UTF8_MAXCP` and is then
** converted to utfint by the cast below -- confirm whether negative
** values should be rejected here. */
987static int Lutf8_char (lua_State *L) {
988 int i, n = lua_gettop(L);
990 luaL_buffinit(L, &b);
991 for (i = 1; i <= n; ++i) {
992 lua_Integer code = luaL_checkinteger(L, i);
993 luaL_argcheck(L, code <= UTF8_MAXCP, i,
"value out of range");
994 add_utf8char(&b, CAST(utfint, code));
1000#define bind_converter(name) \
1001static int Lutf8_##name (lua_State *L) { \
1002 int t = lua_type(L, 1); \
1003 if (t == LUA_TNUMBER) \
1004 lua_pushinteger(L, utf8_to##name(CAST(utfint, lua_tointeger(L, 1)))); \
1005 else if (t == LUA_TSTRING) { \
1007 const char *e, *s = to_utf8(L, 1, &e); \
1008 luaL_buffinit(L, &b); \
1011 s = utf8_safe_decode(L, s, &ch); \
1012 add_utf8char(&b, utf8_to##name(ch)); \
1014 luaL_pushresult(&b); \
1016 else return typeerror(L, 1, "number/string"); \
1019utf8_converters(bind_converter)
1020#undef bind_converter
1025static const char *parse_escape (lua_State *L,
const char *s,
const char *e,
int hex, utfint *pch) {
1028 if (*s ==
'{') ++s, in_bracket = 1;
1029 for (; s < e; ++s) {
1030 utfint ch = (
unsigned char)*s;
1031 if (ch >=
'0' && ch <=
'9') ch = ch -
'0';
1032 else if (hex && ch >=
'A' && ch <=
'F') ch = 10 + (ch -
'A');
1033 else if (hex && ch >=
'a' && ch <=
'f') ch = 10 + (ch -
'a');
1034 else if (!in_bracket)
break;
1035 else if (ch ==
'}') { ++s;
break; }
1036 else luaL_error(L,
"invalid escape '%c'", ch);
1037 code *= hex ? 16 : 10;
1044static int Lutf8_escape (lua_State *L) {
1045 const char *e, *s = check_utf8(L, 1, &e);
1047 luaL_buffinit(L, &b);
1050 s = utf8_safe_decode(L, s, &ch);
1054 case '0':
case '1':
case '2':
case '3':
1055 case '4':
case '5':
case '6':
case '7':
1056 case '8':
case '9':
case '{':
1058 case 'x':
case 'X': hex = 1;
1059 case 'u':
case 'U':
if (s+1 < e) { ++s;
break; }
1062 s = utf8_safe_decode(L, s, &ch);
1065 s = parse_escape(L, s, e, hex, &ch);
1068 add_utf8char(&b, ch);
1070 luaL_pushresult(&b);
1074static int Lutf8_insert (lua_State *L) {
1075 const char *e, *s = check_utf8(L, 1, &e);
1080 const char *first = e;
1081 if (lua_type(L, 2) == LUA_TNUMBER) {
1082 int idx = (int)lua_tointeger(L, 2);
1083 if (idx != 0) first = utf8_relat(s, e, idx);
1084 luaL_argcheck(L, first, 2,
"invalid index");
1087 subs = luaL_checklstring(L, nargs, &sublen);
1088 luaL_buffinit(L, &b);
1089 luaL_addlstring(&b, s, first-s);
1090 luaL_addlstring(&b, subs, sublen);
1091 luaL_addlstring(&b, first, e-first);
1092 luaL_pushresult(&b);
1096static int Lutf8_remove (lua_State *L) {
1097 const char *e, *s = check_utf8(L, 1, &e);
1098 lua_Integer posi = luaL_optinteger(L, 2, -1);
1099 lua_Integer pose = luaL_optinteger(L, 3, -1);
1100 if (!utf8_range(s, e, &posi, &pose))
1104 luaL_buffinit(L, &b);
1105 luaL_addlstring(&b, s, posi);
1106 luaL_addlstring(&b, s+pose, e-s-pose);
1107 luaL_pushresult(&b);
1112static int push_offset (lua_State *L,
const char *s,
const char *e, lua_Integer offset, lua_Integer idx) {
1116 p = utf8_offset(s, e, offset, idx);
1117 else if (p = s+offset-1, iscont(p))
1118 p = utf8_prev(s, p);
1119 if (p == NULL || p == e)
return 0;
1120 utf8_decode(p, &ch, 0);
1121 lua_pushinteger(L, p-s+1);
1122 lua_pushinteger(L, ch);
1126static int Lutf8_charpos (lua_State *L) {
1127 const char *e, *s = check_utf8(L, 1, &e);
1128 lua_Integer offset = 1;
1129 if (lua_isnoneornil(L, 3)) {
1130 lua_Integer idx = luaL_optinteger(L, 2, 0);
1132 else if (idx < 0) offset = e-s+1;
1133 return push_offset(L, s, e, offset, idx);
1135 offset = byte_relat(luaL_optinteger(L, 2, 1), e-s);
1136 if (offset < 1) offset = 1;
1137 return push_offset(L, s, e, offset, luaL_checkinteger(L, 3));
1140static int Lutf8_offset (lua_State *L) {
1142 const char *s = luaL_checklstring(L, 1, &len);
1143 lua_Integer n = luaL_checkinteger(L, 2);
1144 lua_Integer posi = (n >= 0) ? 1 : len + 1;
1145 posi = byte_relat(luaL_optinteger(L, 3, posi), len);
1146 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
1147 "position out of range");
1150 while (posi > 0 && iscont(s + posi)) posi--;
1152 if (iscont(s + posi))
1153 return luaL_error(L,
"initial position is a continuation byte");
1155 while (n < 0 && posi > 0) {
1158 }
while (posi > 0 && iscont(s + posi));
1163 while (n > 0 && posi < (lua_Integer)len) {
1166 }
while (iscont(s + posi));
1172 lua_pushinteger(L, posi + 1);
1178static int Lutf8_next (lua_State *L) {
1179 const char *e, *s = check_utf8(L, 1, &e);
1180 lua_Integer offset = byte_relat(luaL_optinteger(L, 2, 1), e-s);
1181 lua_Integer idx = luaL_optinteger(L, 3, !lua_isnoneornil(L, 2));
1182 return push_offset(L, s, e, offset, idx);
1185static int iter_aux (lua_State *L,
int strict) {
1186 const char *e, *s = check_utf8(L, 1, &e);
1187 int n = CAST(
int, lua_tointeger(L, 2));
1188 const char *p = n <= 0 ? s : utf8_next(s+n-1, e);
1191 utf8_safe_decode(L, p, &code);
1192 if (strict && utf8_invalid(code))
1193 return luaL_error(L,
"invalid UTF-8 code");
1194 lua_pushinteger(L, p-s+1);
1195 lua_pushinteger(L, code);
1201static int iter_auxstrict (lua_State *L) {
return iter_aux(L, 1); }
1202static int iter_auxlax (lua_State *L) {
return iter_aux(L, 0); }
1204static int Lutf8_codes (lua_State *L) {
1205 int lax = lua_toboolean(L, 2);
1206 luaL_checkstring(L, 1);
1207 lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
1208 lua_pushvalue(L, 1);
1209 lua_pushinteger(L, 0);
1213static int Lutf8_width (lua_State *L) {
1214 int t = lua_type(L, 1);
1215 int ambi_is_single = !lua_toboolean(L, 2);
1216 int default_width = CAST(
int, luaL_optinteger(L, 3, 0));
1217 if (t == LUA_TNUMBER) {
1218 size_t chwidth = utf8_width(CAST(utfint, lua_tointeger(L, 1)), ambi_is_single);
1219 if (chwidth == 0) chwidth = default_width;
1220 lua_pushinteger(L, (lua_Integer)chwidth);
1221 }
else if (t != LUA_TSTRING)
1222 return typeerror(L, 1,
"number/string");
1224 const char *e, *s = to_utf8(L, 1, &e);
1229 s = utf8_safe_decode(L, s, &ch);
1230 chwidth = utf8_width(ch, ambi_is_single);
1231 width += chwidth == 0 ? default_width : chwidth;
1233 lua_pushinteger(L, (lua_Integer)width);
1238static int Lutf8_widthindex (lua_State *L) {
1239 const char *e, *s = check_utf8(L, 1, &e);
1240 int width = CAST(
int, luaL_checkinteger(L, 2));
1241 int ambi_is_single = !lua_toboolean(L, 3);
1242 int default_width = CAST(
int, luaL_optinteger(L, 4, 0));
1247 s = utf8_safe_decode(L, s, &ch);
1248 chwidth = utf8_width(ch, ambi_is_single);
1249 if (chwidth == 0) chwidth = default_width;
1250 width -= CAST(
int, chwidth);
1252 lua_pushinteger(L, idx);
1253 lua_pushinteger(L, width + chwidth);
1254 lua_pushinteger(L, chwidth);
1259 lua_pushinteger(L, (lua_Integer)idx);
1263static int Lutf8_ncasecmp (lua_State *L) {
1264 const char *e1, *s1 = check_utf8(L, 1, &e1);
1265 const char *e2, *s2 = check_utf8(L, 2, &e2);
1266 while (s1 < e1 || s2 < e2) {
1267 utfint ch1 = 0, ch2 = 0;
1273 s1 = utf8_safe_decode(L, s1, &ch1);
1274 s2 = utf8_safe_decode(L, s2, &ch2);
1275 ch1 = utf8_tofold(ch1);
1276 ch2 = utf8_tofold(ch2);
1279 lua_pushinteger(L, ch1 > ch2 ? 1 : -1);
1283 lua_pushinteger(L, 0);
1290#ifndef LUA_MAXCAPTURES
1291# define LUA_MAXCAPTURES 32
1294#define CAP_UNFINISHED (-1)
1295#define CAP_POSITION (-2)
1300 const char *src_init;
1301 const char *src_end;
1308 } capture[LUA_MAXCAPTURES];
1312static const char *match (
MatchState *ms,
const char *s,
const char *p);
1315#if !defined(MAXCCALLS)
1316#define MAXCCALLS 200
1320#define SPECIALS "^$*+?.([%-"
/* Validate a capture index used by a back-reference.
** The index is rejected with a Lua error when it is negative, not below
** ms->level, or refers to a capture that is still open (CAP_UNFINISHED).
** The error message reports the index 1-based (l + 1).
** match_capture uses the value returned here as the index into
** ms->capture; any adjustment of l before the test and the success return
** are on lines not visible here. */
1322static int check_capture (
MatchState *ms,
int l) {
1324 if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
1325 return luaL_error(ms->L,
"invalid capture index %%%d", l + 1);
1329static int capture_to_close (
MatchState *ms) {
1330 int level = ms->level;
1331 while (--level >= 0)
1332 if (ms->capture[level].len == CAP_UNFINISHED)
return level;
1333 return luaL_error(ms->L,
"invalid pattern capture");
1336static const char *classend (
MatchState *ms,
const char *p) {
1338 p = utf8_safe_decode(ms->L, p, &ch);
1342 luaL_error(ms->L,
"malformed pattern (ends with " LUA_QL(
"%%")
")");
1343 return utf8_next(p, ms->p_end);
1349 luaL_error(ms->L,
"malformed pattern (missing " LUA_QL(
"]")
")");
1350 if (*(p++) == L_ESC && p < ms->p_end)
1352 }
while (*p !=
']');
1361static int match_class (utfint
c, utfint cl) {
1363 switch (utf8_tolower(cl)) {
1364#define X(cls, name) case cls: res = utf8_is##name(c); break;
1367 case 'g' : res = utf8_isgraph(
c);
break;
1368 case 'w' : res = utf8_isalnum(
c);
break;
1369 case 'z' : res = (
c == 0);
break;
1370 default:
return (cl ==
c);
1372 return (utf8_islower(cl) ? res : !res);
1375static int matchbracketclass (
MatchState *ms, utfint
c,
const char *p,
const char *ec) {
1384 p = utf8_safe_decode(ms->L, p, &ch);
1386 p = utf8_safe_decode(ms->L, p, &ch);
1387 if (match_class(
c, ch))
1391 const char *np = utf8_safe_decode(ms->L, p, &next);
1392 if (next ==
'-' && np < ec) {
1393 p = utf8_safe_decode(ms->L, np, &next);
1394 if (ch <=
c &&
c <= next)
1397 else if (ch ==
c)
return sig;
1403static int singlematch (
MatchState *ms,
const char *s,
const char *p,
const char *ep) {
1404 if (s >= ms->src_end)
1408 utf8_safe_decode(ms->L, s, &ch);
1409 p = utf8_safe_decode(ms->L, p, &pch);
1412 case L_ESC: utf8_safe_decode(ms->L, p, &pch);
1413 return match_class(ch, pch);
1414 case '[':
return matchbracketclass(ms, ch, p-1, ep-1);
1415 default:
return pch == ch;
1420static const char *matchbalance (
MatchState *ms,
const char *s,
const char **p) {
1421 utfint ch=0, begin=0, end=0;
1422 *p = utf8_safe_decode(ms->L, *p, &begin);
1423 if (*p >= ms->p_end)
1424 luaL_error(ms->L,
"malformed pattern "
1425 "(missing arguments to " LUA_QL(
"%%b")
")");
1426 *p = utf8_safe_decode(ms->L, *p, &end);
1427 s = utf8_safe_decode(ms->L, s, &ch);
1428 if (ch != begin)
return NULL;
1431 while (s < ms->src_end) {
1432 s = utf8_safe_decode(ms->L, s, &ch);
1434 if (--cont == 0)
return s;
1436 else if (ch == begin) cont++;
1442static const char *max_expand (
MatchState *ms,
const char *s,
const char *p,
const char *ep) {
1444 while (singlematch(ms, m, p, ep))
1445 m = utf8_next(m, ms->src_end);
1448 const char *res = match(ms, m, ep+1);
1449 if (res)
return res;
1452 m = utf8_prev(s, m);
1457static const char *min_expand (
MatchState *ms,
const char *s,
const char *p,
const char *ep) {
1459 const char *res = match(ms, s, ep+1);
1462 else if (singlematch(ms, s, p, ep))
1463 s = utf8_next(s, ms->src_end);
/* Open a new capture at subject position s and continue matching the
** pattern from p.
** `what` becomes the capture's provisional length; match() passes
** CAP_POSITION or CAP_UNFINISHED here.
** Raises a Lua error when LUA_MAXCAPTURES captures are already in use.
** The handling of a failed match (res == NULL) and the return are on
** lines not visible here. */
1468static const char *start_capture (
MatchState *ms,
const char *s,
const char *p,
int what) {
1470 int level = ms->level;
1471 if (level >= LUA_MAXCAPTURES) luaL_error(ms->L,
"too many captures");
/* record where the capture starts and mark its state */
1472 ms->capture[level].init = s;
1473 ms->capture[level].len = what;
1474 ms->level = level+1;
/* try to match the rest of the pattern with the capture open */
1475 if ((res=match(ms, s, p)) == NULL)
/* Close the innermost open capture at subject position s and continue
** matching the pattern from p.
** The capture's length is set to the distance from its start to s; if
** the rest of the pattern then fails to match, the capture is marked
** CAP_UNFINISHED again so that backtracking sees it as still open. */
1480static const char *end_capture (
MatchState *ms,
const char *s,
const char *p) {
/* raises a Lua error when there is no open capture */
1481 int l = capture_to_close(ms);
1483 ms->capture[l].len = s - ms->capture[l].init;
1484 if ((res = match(ms, s, p)) == NULL)
/* undo the close */
1485 ms->capture[l].len = CAP_UNFINISHED;
1489static const char *match_capture (
MatchState *ms,
const char *s,
int l) {
1491 l = check_capture(ms, l);
1492 len = ms->capture[l].len;
1493 if ((
size_t)(ms->src_end-s) >= len &&
1494 memcmp(ms->capture[l].init, s, len) == 0)
1499static const char *match (
MatchState *ms,
const char *s,
const char *p) {
1500 if (ms->matchdepth-- == 0)
1501 luaL_error(ms->L,
"pattern too complex");
1503 if (p != ms->p_end) {
1505 utf8_safe_decode(ms->L, p, &ch);
1508 if (*(p + 1) ==
')')
1509 s = start_capture(ms, s, p + 2, CAP_POSITION);
1511 s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
1515 s = end_capture(ms, s, p + 1);
1519 if ((p + 1) != ms->p_end)
1521 s = (s == ms->src_end) ? s : NULL;
1525 const char *prev_p = p;
1526 p = utf8_safe_decode(ms->L, p+1, &ch);
1529 s = matchbalance(ms, s, &p);
1536 const char *ep; utfint previous = 0, current = 0;
1538 luaL_error(ms->L,
"missing " LUA_QL(
"[")
" after "
1539 LUA_QL(
"%%f")
" in pattern");
1540 ep = classend(ms, p);
1541 if (s != ms->src_init)
1542 utf8_decode(utf8_prev(ms->src_init, s), &previous, 0);
1543 if (s != ms->src_end)
1544 utf8_decode(s, ¤t, 0);
1545 if (!matchbracketclass(ms, previous, p, ep - 1) &&
1546 matchbracketclass(ms, current, p, ep - 1)) {
1552 case '0':
case '1':
case '2':
case '3':
1553 case '4':
case '5':
case '6':
case '7':
1554 case '8':
case '9': {
1555 s = match_capture(ms, s, ch);
1556 if (s != NULL)
goto init;
1559 default: p = prev_p;
goto dflt;
1564 const char *ep = classend(ms, p);
1566 if (!singlematch(ms, s, p, ep)) {
1567 if (*ep ==
'*' || *ep ==
'?' || *ep ==
'-') {
1568 p = ep + 1;
goto init;
1572 const char *next_s = utf8_next(s, ms->src_end);
1576 const char *next_ep = utf8_next(ep, ms->p_end);
1577 if ((res = match(ms, next_s, next_ep)) != NULL)
1580 p = next_ep;
goto init;
1588 s = max_expand(ms, s, p, ep);
1591 s = min_expand(ms, s, p, ep);
1594 s = next_s; p = ep;
goto init;
1605static const char *lmemfind (
const char *s1,
size_t l1,
const char *s2,
size_t l2) {
1606 if (l2 == 0)
return s1;
1607 else if (l2 > l1)
return NULL;
1612 while (l1 > 0 && (init = (
const char *)memchr(s1, *s2, l1)) != NULL) {
1614 if (memcmp(init, s2+1, l2) == 0)
/* Count how many characters lie between s and p, walking forward from s
** with utf8_next and never past e.
** Returns that count when the walk lands exactly on p; when it stops
** anywhere else (it overshot p or hit e first) the count is reduced by
** one. */
static int get_index (const char *p, const char *s, const char *e) {
  int count = 0;
  while (s < e && s < p) {
    s = utf8_next(s, e);
    count++;
  }
  if (s != p)
    count--;
  return count;
}
1632static void push_onecapture (
MatchState *ms,
int i,
const char *s,
const char *e) {
1633 if (i >= ms->level) {
1635 lua_pushlstring(ms->L, s, e - s);
1637 luaL_error(ms->L,
"invalid capture index");
1639 ptrdiff_t l = ms->capture[i].len;
1640 if (l == CAP_UNFINISHED) luaL_error(ms->L,
"unfinished capture");
1641 if (l == CAP_POSITION) {
1642 int idx = get_index(ms->capture[i].init, ms->src_init, ms->src_end);
1643 lua_pushinteger(ms->L, idx+1);
1645 lua_pushlstring(ms->L, ms->capture[i].init, l);
/* Push the captures of the current match onto the Lua stack.
** When the pattern defined no captures (ms->level == 0) and s is not
** NULL, one value is pushed instead -- push_onecapture pushes the bytes
** s..e when asked for an index at or beyond ms->level.
** Raises a Lua error if the stack cannot grow by the required amount.
** The return statement is not visible here; find_aux adds 2 to the
** result, so it is presumably the number of values pushed. */
1649static int push_captures (
MatchState *ms,
const char *s,
const char *e) {
1651 int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
1652 luaL_checkstack(ms->L, nlevels,
"too many captures");
1653 for (i = 0; i < nlevels; i++)
1654 push_onecapture(ms, i, s, e);
1659static int nospecials (
const char *p,
const char * ep) {
1661 if (strpbrk(p, SPECIALS))
/*
 * find_aux: shared body of Lutf8_find (find != 0) and Lutf8_match
 * (find == 0).
 * Stack arguments: 1 = subject, 2 = pattern, 3 = optional start index
 * (default 1, resolved through utf8_relat), 4 = flag that, when true and
 * find is set, forces a plain substring search.
 */
1671static int find_aux (lua_State *L,
int find) {
1672 const char *es, *s = check_utf8(L, 1, &es);
1673 const char *ep, *p = check_utf8(L, 2, &ep);
1674 lua_Integer idx = luaL_optinteger(L, 3, 1);
/* translate the start index into a pointer inside the subject */
1677 init = utf8_relat(s, es, CAST(
int, idx));
/* plain search: requested through argument 4, or nospecials reports true */
1686 if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) {
1688 const char *s2 = lmemfind(init, es-init, p, ep-p);
1690 const char *e2 = s2 + (ep - p);
/* the match end sits on a continuation byte: move it on with utf8_next */
1691 if (iscont(e2)) e2 = utf8_next(e2, es);
/* first result: 1-based index of s2 counted from the subject start */
1692 lua_pushinteger(L, idx = get_index(s2, s, es) + 1);
/* second result: idx plus the step count from s2 to e2, minus one */
1693 lua_pushinteger(L, idx + get_index(e2, s2, es) - 1);
/* a leading ^ anchors the pattern */
1698 int anchor = (*p ==
'^');
/* negative start index: rebase it using the subject length */
1700 if (idx < 0) idx += utf8_length(s, es)+1;
1702 ms.matchdepth = MAXCCALLS;
1709 assert(ms.matchdepth == MAXCCALLS);
1710 if ((res=match(&ms, init, p)) != NULL) {
/* find: push start and end indices, then the captures; s == NULL makes
 * push_captures push nothing when the pattern has no captures */
1712 lua_pushinteger(L, idx);
1713 lua_pushinteger(L, idx + utf8_length(init, res) - 1);
1714 return push_captures(&ms, NULL, 0) + 2;
/* match: return whatever push_captures pushes for the span [init, res) */
1716 return push_captures(&ms, init, res);
/* subject exhausted: stop; otherwise retry one character further on */
1718 if (init == es)
break;
1720 init = utf8_next(init, es);
/* an anchored pattern is attempted only once */
1721 }
while (init <= es && !anchor);
/* Lutf8_find: Lua entry point that runs find_aux in find mode (find = 1),
 * so start and end indices are pushed ahead of any captures. */
1727static int Lutf8_find (lua_State *L) {
return find_aux(L, 1); }
/* Lutf8_match: Lua entry point that runs find_aux in match mode (find = 0),
 * so only the values pushed by push_captures are returned. */
1728static int Lutf8_match (lua_State *L) {
return find_aux(L, 0); }
/*
 * gmatch_aux: iterator body behind Lutf8_gmatch.
 * Upvalues: 1 = subject, 2 = pattern, 3 = byte offset into the subject at
 * which the next search starts.
 * On a match it stores the new offset in upvalue 3 and returns the values
 * pushed by push_captures for the matched span.
 */
1730static int gmatch_aux (lua_State *L) {
1732 const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es);
1733 const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep);
1736 ms.matchdepth = MAXCCALLS;
/* resume at the saved byte offset and try each following character */
1740 for (src = s + (
size_t)lua_tointeger(L, lua_upvalueindex(3));
1742 src = utf8_next(src, ms.src_end)) {
1745 assert(ms.matchdepth == MAXCCALLS);
1746 if ((e = match(&ms, src, p)) != NULL) {
/* the next search begins where this match ended (byte offset from s) */
1747 lua_Integer newstart = e-s;
/* empty match: step forward so the iterator makes progress.
 * NOTE(review): this adds one byte, not one character, so the next start
 * may fall inside a multi-byte character -- confirm this is intended */
1748 if (e == src) newstart++;
1749 lua_pushinteger(L, newstart);
1750 lua_replace(L, lua_upvalueindex(3));
1751 return push_captures(&ms, src, e);
/* nothing matched up to the end of the subject: leave the loop */
1753 if (src == ms.src_end)
break;
/*
 * Lutf8_gmatch: build the gmatch iterator.
 * Requires arguments 1 and 2 to be strings, then creates a gmatch_aux
 * closure with three upvalues; the third is the start offset, initially 0.
 * NOTE(review): the first two upvalues are presumably the subject and the
 * pattern left on the stack -- confirm against the full body.
 */
1758static int Lutf8_gmatch (lua_State *L) {
1759 luaL_checkstring(L, 1);
1760 luaL_checkstring(L, 2);
1762 lua_pushinteger(L, 0);
1763 lua_pushcclosure(L, gmatch_aux, 3);
/*
 * add_s: append the replacement string (stack argument 3) to buffer b for the
 * match [s, e), decoding it one character at a time.
 * After an escape, a digit 0 inserts the whole match and any other digit
 * inserts the capture numbered ch - 1 (zero-based index for push_onecapture).
 * NOTE(review): the tests that detect the escape character L_ESC are not
 * shown here -- confirm the exact escape handling against the full body.
 */
1767static void add_s (
MatchState *ms, luaL_Buffer *b,
const char *s,
const char *e) {
1768 const char *new_end, *news = to_utf8(ms->L, 3, &new_end);
1769 while (news < new_end) {
1771 news = utf8_safe_decode(ms->L, news, &ch);
/* ordinary character: copy it through */
1773 add_utf8char(b, ch);
/* read the character that follows the escape */
1775 news = utf8_safe_decode(ms->L, news, &ch);
1776 if (!utf8_isdigit(ch)) {
/* non-digit after the escape: error or literal copy, depending on a
 * condition that is not shown here */
1778 luaL_error(ms->L,
"invalid use of " LUA_QL(
"%c")
1779 " in replacement string", L_ESC);
1780 add_utf8char(b, ch);
1781 }
else if (ch ==
'0')
/* digit 0: insert the entire match */
1782 luaL_addlstring(b, s, e-s);
/* other digit: push the corresponding capture */
1784 push_onecapture(ms, ch-
'1', s, e);
/*
 * add_value: compute the replacement for the match [s, e) according to tr,
 * the Lua type of stack argument 3, and leave it ready for buffer b.
 * A false or nil replacement keeps the original match text; a replacement
 * that is not a string raises an error.
 */
1791static void add_value (
MatchState *ms, luaL_Buffer *b,
const char *s,
const char *e,
int tr) {
1792 lua_State *L = ms->L;
/* function replacement: push the function, then the captures as arguments */
1794 case LUA_TFUNCTION: {
1796 lua_pushvalue(L, 3);
1797 n = push_captures(ms, s, e);
/* NOTE(review): presumably the table case, using the first capture as the
 * lookup key -- the case label is not shown here */
1802 push_onecapture(ms, 0, s, e);
/* false or nil result: substitute the unmodified match */
1811 if (!lua_toboolean(L, -1)) {
1813 lua_pushlstring(L, s, e - s);
1814 }
else if (!lua_isstring(L, -1))
1815 luaL_error(L,
"invalid replacement value (a %s)", luaL_typename(L, -1));
/*
 * Lutf8_gsub: global substitution.
 * Stack arguments: 1 = subject, 2 = pattern, 3 = replacement (number,
 * string, function or table), 4 = optional maximum number of substitutions,
 * defaulting to the subject byte length plus one.
 * Pushes the resulting string followed by the integer n.
 */
1819static int Lutf8_gsub (lua_State *L) {
1820 const char *es, *s = check_utf8(L, 1, &es);
1821 const char *ep, *p = check_utf8(L, 2, &ep);
1822 int tr = lua_type(L, 3);
1823 lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1);
/* a leading ^ anchors the pattern */
1824 int anchor = (*p ==
'^');
/* reject replacement values of any other type up front */
1828 luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
1829 tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
1830 "string/function/table expected");
1831 luaL_buffinit(L, &b);
1834 ms.matchdepth = MAXCCALLS;
1841 assert(ms.matchdepth == MAXCCALLS);
1842 e = match(&ms, s, p);
/* NOTE(review): presumably only reached when e is non-NULL -- the guard is
 * not shown here */
1845 add_value(&ms, &b, s, e, tr);
/* copy one decoded character into the buffer.  NOTE(review): presumably the
 * no-match path; the guarding condition is not shown */
1851 s = utf8_safe_decode(L, s, &ch);
1852 add_utf8char(&b, ch);
/* append whatever remains of the subject, then push the result and n */
1856 luaL_addlstring(&b, s, es-s);
1857 luaL_pushresult(&b);
1858 lua_pushinteger(L, n);
/*
 * Lutf8_isvalid: push true when utf8_invalid_offset finds no invalid position
 * in the string given as argument 1, and false otherwise.
 */
1862static int Lutf8_isvalid(lua_State *L) {
1863 const char *e, *s = check_utf8(L, 1, &e);
1864 const char *invalid = utf8_invalid_offset(s, e);
1865 lua_pushboolean(L, invalid == NULL);
/*
 * Lutf8_invalidoffset: report where the first invalid position reported by
 * utf8_invalid_offset lies in the string given as argument 1.
 * Argument 2 is an optional offset (default 0) that moves the scan start.
 * When an invalid position is found, its 1-based byte position relative to
 * the original string start is pushed.
 */
1869static int Lutf8_invalidoffset(lua_State *L) {
1870 const char *e, *s = check_utf8(L, 1, &e);
/* keep the original start: s may be moved by the offset handling */
1871 const char *orig_s = s;
/* NOTE(review): luaL_optinteger is assigned to lua_Integer elsewhere in this
 * file; storing it in an int may truncate very large offsets -- confirm */
1872 int offset = luaL_optinteger(L, 2, 0);
/* negative offset: s - e is the negated byte length, so this branch is taken
 * only when the offset magnitude is smaller than the string length */
1880 }
else if (offset < 0 && s - e < offset) {
1883 const char *invalid = utf8_invalid_offset(s, e);
1884 if (invalid == NULL) {
1887 lua_pushinteger(L, invalid - orig_s + 1);
/*
 * Lutf8_clean: rebuild the string in argument 1 with a replacement string
 * appended where utf8_invalid_offset reports invalid positions.
 * Argument 2 is the optional replacement; the default is the three bytes
 * EF BF BD (the UTF-8 encoding of U+FFFD).
 * The last value pushed is a boolean: 1 when no invalid position was found,
 * 0 when the string had to be rebuilt.
 */
1892static int Lutf8_clean(lua_State *L) {
1893 const char *e, *s = check_utf8(L, 1, &e);
1897 const char *r = luaL_optlstring(L, 2,
"\xEF\xBF\xBD", &repl_len);
/* more than one argument on the stack: validate the replacement string */
1899 if (lua_gettop(L) > 1) {
1901 if (utf8_invalid_offset(r, r + repl_len) != NULL) {
1902 lua_pushstring(L,
"replacement string must be valid UTF-8");
1907 const char *invalid = utf8_invalid_offset(s, e);
/* nothing invalid found: report success */
1908 if (invalid == NULL) {
1910 lua_pushboolean(L, 1);
1915 luaL_buffinit(L, &buff);
/* copy the valid run before the invalid position, then the replacement */
1920 luaL_addlstring(&buff, s, invalid - s);
1921 luaL_addlstring(&buff, r, repl_len);
/* NOTE(review): presumably skips a run of adjacent invalid positions so the
 * replacement is added once per run -- the loop body is only partly shown */
1927 while (s == invalid) {
1929 invalid = utf8_invalid_offset(s, e);
/* no further invalid position: append the tail and finish with false */
1931 if (invalid == NULL) {
1932 luaL_addlstring(&buff, s, e - s);
1933 luaL_pushresult(&buff);
1934 lua_pushboolean(L, 0);
/*
 * Lutf8_isnfc: decode the string in argument 1 strictly, character by
 * character, and push false as soon as either check below fails; push true
 * when every character passes.  Undecodable input produces the message
 * about invalid UTF-8.
 */
1940static int Lutf8_isnfc(lua_State *L) {
1941 const char *e, *s = check_utf8(L, 1, &e);
1942 utfint starter = 0, ch;
1943 unsigned int prev_canon_cls = 0;
/* strict decode (third argument 1) */
1946 s = utf8_decode(s, &ch, 1);
1948 lua_pushstring(L,
"string is not valid UTF-8");
1957 unsigned int canon_cls = lookup_canon_cls(ch);
/* a non-zero class lower than that of the previous character fails */
1958 if (canon_cls && canon_cls < prev_canon_cls) {
1960 lua_pushboolean(L, 0);
/* a table entry exists for ch and nfc_check rejects it */
1965 if (entry && !nfc_check(ch, entry, starter, canon_cls, prev_canon_cls)) {
1966 lua_pushboolean(L, 0);
1970 prev_canon_cls = canon_cls;
/* every character passed both checks */
1975 lua_pushboolean(L, 1);
/*
 * Lutf8_normalize_nfc: scan the string in argument 1 with the same two
 * checks used by Lutf8_isnfc.
 * The visible tail pushes boolean 1 on one path, and on the other builds a
 * new string (the unchanged prefix up to starter_p followed by the output
 * of string_to_nfc for the rest) and pushes it with boolean 0.
 * NOTE(review): which value precedes boolean 1, and what happens inside the
 * two check branches, is not shown here.
 */
1979static int Lutf8_normalize_nfc(lua_State *L) {
1980 const char *e, *s = check_utf8(L, 1, &e), *p = s, *starter_p = s;
1981 utfint starter = 0, ch;
1982 unsigned int prev_canon_cls = 0;
/* strict decode; NULL signals undecodable input */
1987 const char *new_p = utf8_decode(p, &ch, 1);
1988 if (new_p == NULL) {
1989 lua_pushstring(L,
"string is not valid UTF-8");
1993 unsigned int canon_cls = lookup_canon_cls(ch);
/* a non-zero class lower than that of the previous character */
1994 if (canon_cls && canon_cls < prev_canon_cls) {
/* a table entry exists for ch and nfc_check rejects it */
1999 if (entry && !nfc_check(ch, entry, starter, canon_cls, prev_canon_cls)) {
2003 prev_canon_cls = canon_cls;
2012 lua_pushboolean(L, 1);
/* rebuild: keep the prefix before starter_p, normalize the remainder */
2018 luaL_buffinit(L, &buff);
2019 luaL_addlstring(&buff, s, starter_p - s);
2021 string_to_nfc(L, &buff, starter_p, e);
2023 luaL_pushresult(&buff);
2024 lua_pushboolean(L, 0);
/*
 * iterate_grapheme_indices: iterator body created by Lutf8_grapheme_indices.
 * Upvalues: 1 = subject string, 2 = current position pos, 3 = end.
 * Decoding starts at byte s + pos - 1.  The tests below compare the current
 * character ch with the following character next_ch.  At the end the function
 * stores (p - s) + 1 in upvalue 2 as the next start and pushes pos and p - s.
 * NOTE(review): the actions taken inside each branch are not shown here; the
 * comments describe only the visible conditions.
 */
2028static int iterate_grapheme_indices(lua_State *L) {
2029 const char *s = luaL_checkstring(L, lua_upvalueindex(1));
2030 lua_Integer pos = luaL_checkinteger(L, lua_upvalueindex(2));
2031 lua_Integer end = luaL_checkinteger(L, lua_upvalueindex(3));
2037 const char *e = s + end;
/* ch is the character at pos, next_ch the one decoded right after it */
2040 const char *p = utf8_safe_decode(L, s + pos - 1, &ch);
2043 const char *next_p = utf8_safe_decode(L, p, &next_ch);
2047 if (next_ch ==
'\n') {
2053 }
else if (ch ==
'\n' || next_ch ==
'\r' || next_ch ==
'\n') {
2056 }
else if (find_in_range(cntrl_table, table_size(cntrl_table), ch) && !find_in_range(prepend_table, table_size(prepend_table), ch) && ch != 0x200D) {
2059 }
else if (next_ch == 0x200D) {
/* 0x200D is ZERO WIDTH JOINER: look past it for a pictographic character */
2061 if (next_p < e && find_in_range(pictographic_table, table_size(pictographic_table), ch)) {
2064 const char *probe_ep = utf8_safe_decode(L, next_p, &nextnext_ch);
2065 if (find_in_range(pictographic_table, table_size(pictographic_table), nextnext_ch)) {
2072 }
else if (find_in_range(cntrl_table, table_size(cntrl_table), next_ch) && !find_in_range(prepend_table, table_size(prepend_table), next_ch)) {
/* Indic conjunct handling: starting from a consonant, probe forward while
 * indic_conjunct_type keeps returning a non-zero type */
2076 if (indic_conjunct_type(ch) == INDIC_CONSONANT) {
2077 utfint probed_ch = next_ch;
2078 const char *probe = next_p;
2079 int indic_type = indic_conjunct_type(probed_ch);
2081 while (indic_type) {
2084 if (indic_type == INDIC_LINKER) {
2086 }
else if (indic_type == INDIC_CONSONANT) {
2091 goto next_iteration;
2095 probe = utf8_safe_decode(L, probe, &probed_ch);
2096 indic_type = indic_conjunct_type(probed_ch);
/* next_ch is in compose_table or in 0x1F3FB..0x1F3FF (emoji modifiers) */
2100 if (find_in_range(compose_table, table_size(compose_table), next_ch) || (next_ch >= 0x1F3FB && next_ch <= 0x1F3FF)) {
2102 if (next_p < e && find_in_range(pictographic_table, table_size(pictographic_table), ch)) {
2105 const char *probe = next_p;
2107 probe = utf8_safe_decode(L, probe, &probed_ch);
2108 if (probed_ch == 0x200D) {
2110 probe = utf8_safe_decode(L, probe, &probed_ch);
2111 if (find_in_range(pictographic_table, table_size(pictographic_table), probed_ch)) {
2113 next_ch = probed_ch;
2117 }
else if (find_in_range(compose_table, table_size(compose_table), probed_ch) || (probed_ch >= 0x1F3FB && probed_ch <= 0x1F3FF)) {
2119 next_ch = probed_ch;
2126 }
else if (find_in_range(spacing_mark_table, table_size(spacing_mark_table), next_ch)) {
2129 }
else if (find_in_range(prepend_table, table_size(prepend_table), ch)) {
2133 }
else if (ch >= 0x1F1E6 && ch <= 0x1F1FF && next_ch >= 0x1F1E6 && next_ch <= 0x1F1FF) {
/* both characters lie in 0x1F1E6..0x1F1FF (regional indicator symbols) */
2141 int hangul1 = hangul_type(ch);
2143 int hangul2 = hangul_type(next_ch);
/* Hangul pairs: L binds unless T follows; LV or V binds a following V or T;
 * LVT or T binds only a following T */
2145 if (hangul1 == HANGUL_L) {
2146 bind = (hangul2 != HANGUL_T);
2147 }
else if (hangul1 == HANGUL_LV || hangul1 == HANGUL_V) {
2148 bind = (hangul2 == HANGUL_V || hangul2 == HANGUL_T);
2149 }
else if (hangul1 == HANGUL_LVT || hangul1 == HANGUL_T) {
2150 bind = (hangul2 == HANGUL_T);
/* remember where the next call starts */
2164 lua_pushinteger(L, (p - s) + 1);
2165 lua_replace(L, lua_upvalueindex(2));
/* results: pos and the byte offset p - s */
2167 lua_pushinteger(L, pos);
2168 lua_pushinteger(L, p - s);
/*
 * Lutf8_grapheme_indices: build the iterate_grapheme_indices closure.
 * Stack arguments: 1 = subject string, 2 = optional start (default 1),
 * 3 = optional end (default the string length); both pass through
 * byte_relat.  Raises an argument error when start is below 1 or end is
 * past the string length.  start and end become upvalues 2 and 3.
 * NOTE(review): upvalue 1 is presumably the subject string pushed in a line
 * that is not shown here -- confirm.
 */
2172static int Lutf8_grapheme_indices(lua_State *L) {
2174 const char *s = luaL_checklstring(L, 1, &len);
2175 lua_Integer start = byte_relat(luaL_optinteger(L, 2, 1), len);
2176 lua_Integer end = byte_relat(luaL_optinteger(L, 3, len), len);
2177 luaL_argcheck(L, start >= 1, 2,
"out of range");
2178 luaL_argcheck(L, end <= (lua_Integer)len, 3,
"out of range");
2181 lua_pushinteger(L, start);
2182 lua_pushinteger(L, end);
2183 lua_pushcclosure(L, iterate_grapheme_indices, 3);
/*
 * UTF8PATT: Lua pattern matching one byte in 00-7F or C2-F4 followed by any
 * number of bytes in 80-BF.  luaopen_utf8 stores it, without the terminating
 * NUL (sizeof - 1), in the field charpattern.
 */
2189#if LUA_VERSION_NUM >= 502
2190static const char UTF8PATT[] =
"[\0-\x7F\xC2-\xF4][\x80-\xBF]*";
/* variant for the other LUA_VERSION_NUM case: %z plus the range starting at
 * byte 01 replace the embedded NUL of the pattern above */
2192static const char UTF8PATT[] =
"[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*";
2195LUALIB_API
int luaopen_utf8 (lua_State *L) {
2197#define ENTRY(name) { #name, Lutf8_##name }
2224 ENTRY(invalidoffset),
2227 ENTRY(normalize_nfc),
2228 ENTRY(grapheme_indices),
2233#if LUA_VERSION_NUM >= 502
2234 luaL_newlib(L, libs);
2236 luaL_register(L,
"utf8", libs);
2239 lua_pushlstring(L, UTF8PATT,
sizeof(UTF8PATT)-1);
2240 lua_setfield(L, -2,
"charpattern");