// Bytes required to store a given character in UTF-8, using the original
// scheme that allows sequences of up to six bytes: 1 for c <= 0x7f, 2 for
// c <= 0x7ff, and so on; anything above 0x03ffffff needs 6.
static int cbytes(unsigned int c)
{
    // NB[k] is the largest value that still fits in k+1 bytes.
    static const unsigned int NB[] = { 0x7f, 0x07ff, 0xffff, 0x001fffff, 0x03ffffff };
    int i;
    // Count how many size limits c exceeds; the body is intentionally empty.
    for(i=0; i<(int)(sizeof(NB)/sizeof(NB[0])) && c>NB[i]; i++) {}
    return i+1;
}
// Returns a byte with the N high order bits set (meaningful for 0 <= n <= 8).
// Uses unsigned arithmetic only: the low byte of 0xff00 >> n is n one-bits
// followed by zeros, so the result does not depend on how the compiler
// right-shifts negative values.
#define TOPBITS(n) ((unsigned char)(0xff00u >> (n)))
// Write a utf8 character.  c is the character code, s the destination and
// len the space available at s.  Returns the number of bytes written, or
// zero on error: not enough room, or c wider than the 31 bits that the
// longest (6-byte) sequence can carry.
static int writec(unsigned int c, unsigned char* s, int len)
{
    int i, n;
    // Without this check the leftover high bits would be OR-ed into the
    // lead byte and corrupt its length marker.
    if(c > 0x7fffffff) return 0;
    n = cbytes(c);
    if(len < n) return 0;
    // Continuation bytes are filled last-to-first: 10xxxxxx, six payload
    // bits each, consuming c from its low end.
    for(i=n-1; i>0; i--) {
        s[i] = 0x80 | (c & 0x3f);
        c >>= 6;
    }
    // Lead byte: n high bits set as the length marker (none for a single
    // byte) combined with the bits of c that remain.
    s[0] = (n > 1 ? TOPBITS(n) : 0) | c;
    return n;
}
// Read a utf8 character from the len bytes at s.  On success returns the
// character code and stores the number of bytes consumed in *used.
// Returns -1, leaving *used untouched, on error: empty input, invalid lead
// byte, sequence longer than len, bad continuation byte, or an encoding
// longer than the character requires.
static int readc(unsigned char* s, int len, int* used)
{
    int n, i, c;
    if(len < 1) return -1;
    if(s[0] < 0x80) { *used = 1; return s[0]; }
    // The lead byte of an n-byte sequence has n one-bits followed by a
    // zero; n stays at 7 when no length matches.
    for(n=2; n<7; n++)
        if((s[0] & TOPBITS(n+1)) == TOPBITS(n))
            break;
    if(len < n || n > 6) return -1;
    // Payload bits of the lead byte are those below the length marker.
    c = s[0] & (~TOPBITS(n+1));
    for(i=1; i<n; i++) {
        // Every continuation byte must look like 10xxxxxx.
        if((s[i] >> 6) != 2) return -1;
        c = (c << 6) | (s[i] & 0x3f);
    }
    // Reject over-long forms: the value must need exactly n bytes.
    if(n != cbytes(c)) return -1;
    *used = n;
    return c;
}
/* Public symbol used by the parser.  Hands the buffer to readc() viewed as
 * unsigned bytes and returns readc()'s result unchanged; len and used are
 * passed straight through. */
int naLexUtf8C(char* s, int len, int* used)
{
    unsigned char* bytes = (void*)s;
    return readc(bytes, len, used);
}
// Advance n characters into the utf8 buffer s.  *len holds the number of
// bytes available at s on entry and is reduced by the bytes skipped.
// Returns a pointer to character number n (possibly the end of the
// buffer), or a null pointer on an encoding error or when the buffer holds
// fewer than n characters.
static unsigned char* nthchar(unsigned char* s, int n, int* len)
{
    int i, bytes;
    for(i=0; *len && i<n; i++) {
        if(readc(s, *len, &bytes) < 0) return 0;
        s += bytes; *len -= bytes;
    }
    // The loop also stops when the input runs out; only a full count of n
    // is a success, which is what lets callers report an index overrun.
    // A negative n never matches and is reported the same way.
    return i == n ? s : 0;
}
64 static naRef f_chstr(naContext ctx, naRef me, int argc, naRef* args)
69 if(argc < 1 || naIsNil(ch=naNumValue(args[0])))
70 naRuntimeError(ctx, "bad/missing argument to utf8.chstr");
71 n = writec((int)ch.num, buf, sizeof(buf));
72 return naStr_fromdata(naNewString(ctx), (void*)buf, n);
75 static naRef f_size(naContext c, naRef me, int argc, naRef* args)
79 if(argc < 1 || !naIsString(args[0]))
80 naRuntimeError(c, "bad/missing argument to utf8.strc");
81 s = (void*)naStr_data(args[0]);
82 len = naStr_len(args[0]);
84 if(readc(s, len, &n) < 0)
85 naRuntimeError(c, "utf8 encoding error in utf8.size");
86 sz++; len -= n; s += n;
91 static naRef f_strc(naContext ctx, naRef me, int argc, naRef* args)
96 if(argc < 2 || !naIsString(args[0]) || naIsNil(idx=naNumValue(args[1])))
97 naRuntimeError(ctx, "bad/missing argument to utf8.strc");
98 len = naStr_len(args[0]);
99 s = nthchar((void*)naStr_data(args[0]), (int)idx.num, &len);
100 if(!s || (c = readc(s, len, &bytes)) < 0)
101 naRuntimeError(ctx, "utf8 encoding error in utf8.strc");
105 static naRef f_substr(naContext c, naRef me, int argc, naRef* args)
109 unsigned char *s, *s2;
110 end = argc > 2 ? naNumValue(args[2]) : naNil();
111 if((argc < 2 || !naIsString(args[0]) || naIsNil(start=naNumValue(args[1])))
112 || (argc > 2 && naIsNil(end)))
113 naRuntimeError(c, "bad/missing argument to utf8.substr");
114 len = naStr_len(args[0]);
115 if(!(s = nthchar((void*)naStr_data(args[0]), (int)start.num, &len)))
116 naRuntimeError(c, "start index overrun in utf8.substr");
118 if(!(s2 = nthchar(s, (int)end.num, &len)))
119 naRuntimeError(c, "end index overrun in utf8.substr");
122 return naStr_fromdata(naNewString(c), (void*)s, len);
125 static naRef f_validate(naContext c, naRef me, int argc, naRef* args)
127 naRef result, unkc=naNil();
128 int len, len2, lenout=0, n;
129 unsigned char *s, *s2, *buf;
130 if(argc < 1 || !naIsString(args[0]) ||
131 (argc > 1 && naIsNil(unkc=naNumValue(args[1]))))
132 naRuntimeError(c, "bad/missing argument to utf8.strc");
133 if(naIsNil(unkc)) unkc = naNum('?');
134 len = naStr_len(args[0]);
135 s = (void*)naStr_data(args[0]);
136 len2 = 6*len; // max for ridiculous unkc values
137 s2 = buf = naAlloc(len2);
139 int c = readc(s, len, &n);
140 if(c < 0) { c = (int)unkc.num; n = 1; }
142 n = writec(c, s2, len2);
143 s2 += n; len2 -= n; lenout += n;
145 result = naStr_fromdata(naNewString(c), (char*)buf, lenout);
150 static naCFuncItem funcs[] = {
151 { "chstr", f_chstr },
153 { "substr", f_substr },
155 { "validate", f_validate },
159 naRef naInit_utf8(naContext c)
161 return naGenLib(c, funcs);