// Maps a character value to its encoded length by counting how many of the
// thresholds in NB the value exceeds.
// NOTE(review): the declaration of i and the return statement are not part of
// this view; presumably the result is derived from the final i -- confirm.
5 // bytes required to store a given character
6 static int cbytes(unsigned int c)
// Ascending thresholds that the scan below compares c against.
8 static const int NB[] = { 0x7f, 0x07ff, 0xffff, 0x001fffff, 0x03ffffff };
// Empty-bodied scan: i stops at the first threshold c does not exceed, or at
// the number of table entries (5) when c exceeds all of them.
10 for(i=0; i<(sizeof(NB)/sizeof(NB[0])) && c>NB[i]; i++) {}
// Returns a byte with the N high order bits set (N in 0..8), e.g.
// TOPBITS(1) == 0x80, TOPBITS(3) == 0xE0, TOPBITS(8) == 0xFF.
// The mask is produced by shifting an unsigned constant and truncating to
// unsigned char, so the result does not depend on how the compiler converts
// 0x80 to signed char or on how it right-shifts a negative value (both are
// implementation-defined in C).  N is evaluated exactly once.
#define TOPBITS(n) ((unsigned char)(0xff00u >> (n)))
17 // write a utf8 character, return bytes written or zero on error
// c is the character value, s the destination buffer and len its capacity.
// NOTE(review): the declarations of i and n, the check against len, any
// update of c between loop iterations and the return statement are not part
// of this view.
18 static int writec(unsigned int c, unsigned char* s, int len)
// Fill the trailing bytes from s[n-1] down to s[1]: each gets the 0x80 marker
// bit OR'd with the low six bits of c.
22 for(i=n-1; i>0; i--) {
23 s[i] = 0x80 | (c & 0x3f);
// Lead byte: the TOPBITS(n) prefix when n > 1 (no prefix when n is 1),
// OR'd with c.
26 s[0] = (n > 1 ? TOPBITS(n) : 0) | c;
30 // read a utf8 character, or -1 on error.
// s points at the first byte to decode and len is the number of bytes
// available.  On the single-byte path *used is set to 1; callers in this file
// advance their cursor by *used after a successful call.
// NOTE(review): the local declarations, the loops over n and i, and the
// success return for multi-byte sequences are not part of this view.
31 static int readc(unsigned char* s, int len, int* used)
// Single-byte fast path: a first byte below 0x80 is returned unchanged.
// NOTE(review): when len is 0 this test is skipped; whether an earlier guard
// handles empty input cannot be seen from here -- confirm.
34 if(len > 0 && s[0] < 0x80) { *used = 1; return s[0]; }
// Lead-byte match: the top n+1 bits of s[0] must be n one-bits followed by a
// zero bit.
36 if((s[0] & TOPBITS(n+1)) == TOPBITS(n))
// Fail when the input holds fewer than n bytes or n is larger than 6.
38 if(len < n || n > 6) return -1;
// Keep only the bits of the lead byte below its n+1 prefix bits.
39 c = s[0] & (~TOPBITS(n+1));
// Byte i must have 10 as its top two bits; its low six bits are appended to c.
41 if((s[i] >> 6) != 2) return -1;
42 c = (c << 6) | (s[i] & 0x3f);
// Reject encodings whose length differs from what cbytes reports for the
// decoded value (e.g. overlong forms).
44 if(n != cbytes(c)) return -1;
/*
 * Parser-facing entry point: decodes one UTF-8 character from the start of s
 * by handing the buffer to readc as unsigned bytes.  The result and *used are
 * exactly what readc produces.
 */
int naLexUtf8C(char* s, int len, int* used)
{
    unsigned char* bytes = (void*)s;
    return readc(bytes, len, used);
}
// Steps forward through the buffer at s one decoded character at a time, for
// at most n characters, reducing *len by the bytes skipped.
// Returns 0 when readc reports a decoding error.
// NOTE(review): the local declarations and the success return are not part of
// this view; callers use the result as the position reached -- confirm.
53 static unsigned char* nthchar(unsigned char* s, int n, int* len)
// Stop after n characters or as soon as no bytes remain.
56 for(i=0; *len && i<n; i++) {
57 if(readc(s, *len, &bytes) < 0) return 0;
// Skip the bytes readc reported for this character.
58 s += bytes; *len -= bytes;
// Library function registered as "chstr": builds a string from the bytes
// writec produces for the numeric first argument.
// NOTE(review): the declarations of ch, buf and n are not part of this view.
63 static naRef f_chstr(naContext ctx, naRef me, int argc, naRef* args)
// Raise a runtime error unless a first argument exists and naNumValue yields
// a non-nil value for it.
68 if(argc < 1 || naIsNil(ch=naNumValue(args[0])))
69 naRuntimeError(ctx, "bad/missing argument to utf8.chstr");
// Encode the value, truncated to int, into buf; n is writec's return value.
70 n = writec((int)ch.num, buf, sizeof(buf));
// The result is a new string holding the first n bytes of buf.
71 return naStr_fromdata(naNewString(ctx), (void*)buf, n);
// Library function that walks the string argument one decoded character at a
// time, incrementing sz for each; a decoding failure raises a runtime error.
// NOTE(review): the local declarations, the loop header and the return
// statement are not part of this view; presumably sz is the result -- confirm.
74 static naRef f_size(naContext c, naRef me, int argc, naRef* args)
78 if(argc < 1 || !naIsString(args[0]))
// NOTE(review): this message names utf8.strc while the decoding error below
// names utf8.size -- looks like a copy/paste slip; confirm before changing.
79 naRuntimeError(c, "bad/missing argument to utf8.strc");
80 s = (void*)naStr_data(args[0]);
81 len = naStr_len(args[0]);
// Decode the character at s; n receives the number of bytes it occupies.
83 if(readc(s, len, &n) < 0)
84 naRuntimeError(c, "utf8 encoding error in utf8.size");
// Count the character and move past its bytes.
85 sz++; len -= n; s += n;
// Library function utf8.strc (name taken from its error messages): decodes
// the character found after skipping idx characters of the string argument.
// NOTE(review): the local declarations and the return statement are not part
// of this view; presumably the decoded value c is returned -- confirm.
90 static naRef f_strc(naContext ctx, naRef me, int argc, naRef* args)
// Needs a string first argument and a second argument for which naNumValue
// yields a non-nil value.
95 if(argc < 2 || !naIsString(args[0]) || naIsNil(idx=naNumValue(args[1])))
96 naRuntimeError(ctx, "bad/missing argument to utf8.strc");
97 len = naStr_len(args[0]);
// Skip idx characters; len is updated to the number of bytes remaining.
98 s = nthchar((void*)naStr_data(args[0]), (int)idx.num, &len);
// A null position from nthchar or a negative result from readc is reported
// as an encoding error.
99 if(!s || (c = readc(s, len, &bytes)) < 0)
100 naRuntimeError(ctx, "utf8 encoding error in utf8.strc");
// Library function registered as "substr": returns a new string made of len
// bytes starting at the position reached by skipping start characters of the
// string argument.
// NOTE(review): the declarations of start, end and len, the condition guarding
// the second nthchar call, and any adjustment of len before the return are not
// part of this view.
104 static naRef f_substr(naContext c, naRef me, int argc, naRef* args)
108 unsigned char *s, *s2;
// The third argument is optional: end stays nil when it is absent.
109 end = argc > 2 ? naNumValue(args[2]) : naNil();
// Needs a string, a numeric start and, when a third argument is supplied, a
// numeric value for it as well.
110 if((argc < 2 || !naIsString(args[0]) || naIsNil(start=naNumValue(args[1])))
111 || (argc > 2 && naIsNil(end)))
112 naRuntimeError(c, "bad/missing argument to utf8.substr");
113 len = naStr_len(args[0]);
// Skip start characters from the beginning; a null result is reported as a
// start overrun.
114 if(!(s = nthchar((void*)naStr_data(args[0]), (int)start.num, &len)))
115 naRuntimeError(c, "start index overrun in utf8.substr");
// This skip begins at s, so end.num is counted in characters from the start
// position, not from the beginning of the string.
117 if(!(s2 = nthchar(s, (int)end.num, &len)))
118 naRuntimeError(c, "end index overrun in utf8.substr");
121 return naStr_fromdata(naNewString(c), (void*)s, len);
// Library function registered as "validate": re-encodes the string argument
// character by character into a scratch buffer, substituting the replacement
// character unkc wherever readc fails, and builds the result from that buffer.
// NOTE(review): the loop header, the advance of s/len after each read, the
// release of buf and the return statement are not part of this view.
124 static naRef f_validate(naContext c, naRef me, int argc, naRef* args)
126 naRef result, unkc=naNil();
127 int len, len2, lenout=0, n;
128 unsigned char *s, *s2, *buf;
// Needs a string; the optional second argument must give a non-nil value
// through naNumValue.
129 if(argc < 1 || !naIsString(args[0]) ||
130 (argc > 1 && naIsNil(unkc=naNumValue(args[1]))))
// NOTE(review): the message names utf8.strc although this function is
// registered as "validate" -- looks like a copy/paste slip; confirm.
131 naRuntimeError(c, "bad/missing argument to utf8.strc");
// Default replacement character when none was supplied.
132 if(naIsNil(unkc)) unkc = naNum('?');
133 len = naStr_len(args[0]);
134 s = (void*)naStr_data(args[0]);
// Output capacity: six bytes for every input byte.
135 len2 = 6*len; // max for ridiculous unkc values
136 s2 = buf = naAlloc(len2);
// NOTE(review): this int c shadows the naContext parameter c; the
// naNewString(c) call further down needs the context, so this declaration
// must sit in an inner scope that has closed by then -- confirm.
138 int c = readc(s, len, &n);
// On a decoding failure substitute unkc and treat one input byte as consumed.
139 if(c < 0) { c = (int)unkc.num; n = 1; }
// Encode into the output; n now holds the number of bytes written.
141 n = writec(c, s2, len2);
142 s2 += n; len2 -= n; lenout += n;
// Build the result from the lenout bytes accumulated in buf.
144 result = naStr_fromdata(naNewString(c), (char*)buf, lenout);
// Table pairing the script-visible function names with their C
// implementations; it is handed to naGenLib by naInit_utf8.
// NOTE(review): only some entries and no terminator are part of this view.
149 static naCFuncItem funcs[] = {
150 { "chstr", f_chstr },
152 { "substr", f_substr },
154 { "validate", f_validate },
// Library entry point: returns whatever naGenLib builds from the funcs table
// for the given context.
158 naRef naInit_utf8(naContext c)
160 return naGenLib(c, funcs);