mirror of
https://github.com/golang/go
synced 2024-11-22 06:24:38 -07:00
update to Unicode 5
SVN=126184
This commit is contained in:
parent
0d079a5362
commit
5b904a3bde
@ -34,7 +34,7 @@ asin(arg double)double
|
||||
sign = true;
|
||||
}
|
||||
if arg > 1 {
|
||||
panic "return sys.NaN()";
|
||||
return sys.NaN();
|
||||
}
|
||||
|
||||
temp = sqrt(1 - x*x);
|
||||
@ -54,7 +54,7 @@ func
|
||||
acos(arg double)double
|
||||
{
|
||||
if(arg > 1 || arg < -1) {
|
||||
panic "return sys.NaN()";
|
||||
return sys.NaN();
|
||||
}
|
||||
return pio2 - asin(arg);
|
||||
}
|
||||
|
@ -40,7 +40,7 @@ exp(arg double) double
|
||||
return 0.;
|
||||
}
|
||||
if arg > maxf {
|
||||
panic "return sys.Inf(1)"
|
||||
return sys.Inf(1)
|
||||
}
|
||||
|
||||
x = arg*log2e;
|
||||
|
@ -36,7 +36,7 @@ log(arg double) double
|
||||
var exp int;
|
||||
|
||||
if arg <= 0 {
|
||||
panic "return sys.NaN()";
|
||||
return sys.NaN();
|
||||
}
|
||||
|
||||
exp,x = sys.frexp(arg);
|
||||
@ -63,7 +63,7 @@ log10(arg double) double
|
||||
{
|
||||
|
||||
if arg <= 0 {
|
||||
panic "return sys.NaN()";
|
||||
return sys.NaN();
|
||||
}
|
||||
return log(arg) * ln10o1;
|
||||
}
|
||||
|
@ -5,7 +5,25 @@
|
||||
|
||||
package main
|
||||
|
||||
import math "math"
|
||||
//import math "math"
|
||||
//////////////////
|
||||
import math "asin"
|
||||
import math "atan"
|
||||
import math "atan2"
|
||||
import math "exp"
|
||||
import math "fabs"
|
||||
import math "floor"
|
||||
import math "fmod"
|
||||
import math "hypot"
|
||||
import math "log"
|
||||
import math "pow"
|
||||
import math "pow10"
|
||||
import math "sin"
|
||||
import math "sinh"
|
||||
import math "sqrt"
|
||||
import math "tan"
|
||||
import math "tanh"
|
||||
|
||||
|
||||
const
|
||||
(
|
||||
|
@ -26,14 +26,14 @@ pow(arg1,arg2 double) double
|
||||
if arg1 <= 0 {
|
||||
if(arg1 == 0) {
|
||||
if arg2 <= 0 {
|
||||
panic "return sys.NaN()";
|
||||
return sys.NaN();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
temp = floor(arg2);
|
||||
if temp != arg2 {
|
||||
panic "return sys.NaN()";
|
||||
panic sys.NaN();
|
||||
}
|
||||
|
||||
l = long(temp);
|
||||
|
@ -48,7 +48,7 @@ sinh(arg double) double
|
||||
temp = exp(arg)/2;
|
||||
|
||||
case arg > 0.5:
|
||||
// temp = (exp(arg) - exp(-arg))/2;
|
||||
temp = (exp(arg) - exp(-arg))/2;
|
||||
|
||||
default:
|
||||
argsq = arg*arg;
|
||||
@ -71,5 +71,5 @@ cosh(arg double) double
|
||||
if arg > 21 {
|
||||
return exp(arg)/2;
|
||||
}
|
||||
// return (exp(arg) + exp(-arg))/2;
|
||||
return (exp(arg) + exp(-arg))/2;
|
||||
}
|
||||
|
@ -19,11 +19,10 @@ sqrt(arg double) double
|
||||
var x, temp double;
|
||||
var exp, i int;
|
||||
|
||||
/* BUG: NO isINF
|
||||
if sys.isInf(arg, 1) {
|
||||
return arg;
|
||||
}
|
||||
*/
|
||||
|
||||
if arg <= 0 {
|
||||
if arg < 0 {
|
||||
panic "return sys.NaN()"
|
||||
|
@ -62,7 +62,7 @@ tan(arg double) double
|
||||
|
||||
if flag {
|
||||
if(temp == 0) {
|
||||
panic "return sys.NaN()";
|
||||
panic sys.NaN();
|
||||
}
|
||||
temp = 1/temp;
|
||||
}
|
||||
|
733
src/lib9/utf/mkrunetype.c
Normal file
733
src/lib9/utf/mkrunetype.c
Normal file
@ -0,0 +1,733 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
/*
|
||||
* make is(upper|lower|title|space|alpha)rune and
|
||||
* to(upper|lower|title)rune from a UnicodeData.txt file.
|
||||
* these can be found at unicode.org
|
||||
*
|
||||
* with -c, runs a check of the existing runetype functions vs.
|
||||
* those extracted from UnicodeData.
|
||||
*
|
||||
* with -p, generates tables for pairs of chars, as well as for ranges
|
||||
* and singletons.
|
||||
*
|
||||
* UnicodeData defines 4 fields of interest:
|
||||
* 1) a category
|
||||
* 2) an upper case mapping
|
||||
* 3) a lower case mapping
|
||||
* 4) a title case mapping
|
||||
*
|
||||
* toupper, tolower, and totitle are defined directly from the mapping.
|
||||
*
|
||||
* isalpharune(c) is true iff c is a "letter" category
|
||||
* isupperrune(c) is true iff c is the target of toupperrune,
|
||||
* or is in the uppercase letter category
|
||||
* similarly for islowerrune and istitlerune.
|
||||
* isspacerune is true for space category chars, "C" locale white space chars,
|
||||
* and two additions:
|
||||
* 0085 "next line" control char
|
||||
* feff "zero-width non-break space"
|
||||
* isdigitrune is true iff c is a numeric-digit category.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include <libgen.h>
|
||||
#include "utf.h"
|
||||
#include "utfdef.h"
|
||||
|
||||
enum {
|
||||
/*
|
||||
* fields in the unicode data file
|
||||
*/
|
||||
FIELD_CODE,
|
||||
FIELD_NAME,
|
||||
FIELD_CATEGORY,
|
||||
FIELD_COMBINING,
|
||||
FIELD_BIDIR,
|
||||
FIELD_DECOMP,
|
||||
FIELD_DECIMAL_DIG,
|
||||
FIELD_DIG,
|
||||
FIELD_NUMERIC_VAL,
|
||||
FIELD_MIRRORED,
|
||||
FIELD_UNICODE_1_NAME,
|
||||
FIELD_COMMENT,
|
||||
FIELD_UPPER,
|
||||
FIELD_LOWER,
|
||||
FIELD_TITLE,
|
||||
NFIELDS,
|
||||
|
||||
MAX_LINE = 1024,
|
||||
|
||||
TO_OFFSET = 1 << 20,
|
||||
|
||||
NRUNES = 1 << 21,
|
||||
};
|
||||
|
||||
#define TO_DELTA(xmapped,x) (TO_OFFSET + (xmapped) - (x))
|
||||
|
||||
static char myisspace[NRUNES];
|
||||
static char myisalpha[NRUNES];
|
||||
static char myisdigit[NRUNES];
|
||||
static char myisupper[NRUNES];
|
||||
static char myislower[NRUNES];
|
||||
static char myistitle[NRUNES];
|
||||
|
||||
static int mytoupper[NRUNES];
|
||||
static int mytolower[NRUNES];
|
||||
static int mytotitle[NRUNES];
|
||||
|
||||
static void check(void);
|
||||
static void mktables(char *src, int usepairs);
|
||||
static void fatal(const char *fmt, ...);
|
||||
static int mygetfields(char **fields, int nfields, char *str, const char *delim);
|
||||
static int getunicodeline(FILE *in, char **fields, char *buf);
|
||||
static int getcode(char *s);
|
||||
|
||||
/*
 * print the command-line synopsis on stderr and exit with status 1.
 */
static void
usage(void)
{
	fputs("usage: mktables [-cp] <UnicodeData.txt>\n", stderr);
	exit(1);
}
|
||||
|
||||
int
|
||||
main(int argc, char *argv[]){
|
||||
FILE *in;
|
||||
char buf[MAX_LINE], buf2[MAX_LINE];
|
||||
char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
|
||||
char *p;
|
||||
int i, code, last, docheck, usepairs;
|
||||
|
||||
docheck = 0;
|
||||
usepairs = 0;
|
||||
ARGBEGIN{
|
||||
case 'c':
|
||||
docheck = 1;
|
||||
break;
|
||||
case 'p':
|
||||
usepairs = 1;
|
||||
break;
|
||||
default:
|
||||
usage();
|
||||
}ARGEND
|
||||
|
||||
if(argc != 1){
|
||||
usage();
|
||||
}
|
||||
|
||||
in = fopen(argv[0], "r");
|
||||
if(in == NULL){
|
||||
fatal("can't open %s", argv[0]);
|
||||
}
|
||||
|
||||
for(i = 0; i < NRUNES; i++){
|
||||
mytoupper[i] = i;
|
||||
mytolower[i] = i;
|
||||
mytotitle[i] = i;
|
||||
}
|
||||
|
||||
/*
|
||||
* make sure isspace has all of the "C" locale whitespace chars
|
||||
*/
|
||||
myisspace['\t'] = 1;
|
||||
myisspace['\n'] = 1;
|
||||
myisspace['\r'] = 1;
|
||||
myisspace['\f'] = 1;
|
||||
myisspace['\v'] = 1;
|
||||
|
||||
/*
|
||||
* a couple of other exceptions
|
||||
*/
|
||||
myisspace[0x85] = 1; /* control char, "next line" */
|
||||
myisspace[0xfeff] = 1; /* zero-width non-break space */
|
||||
|
||||
last = -1;
|
||||
while(getunicodeline(in, fields, buf)){
|
||||
code = getcode(fields[FIELD_CODE]);
|
||||
if (code >= NRUNES)
|
||||
fatal("code-point value too big: %x", code);
|
||||
if(code <= last)
|
||||
fatal("bad code sequence: %x then %x", last, code);
|
||||
last = code;
|
||||
|
||||
/*
|
||||
* check for ranges
|
||||
*/
|
||||
p = fields[FIELD_CATEGORY];
|
||||
if(strstr(fields[FIELD_NAME], ", First>") != NULL){
|
||||
if(!getunicodeline(in, fields2, buf2))
|
||||
fatal("range start at eof");
|
||||
if (strstr(fields2[FIELD_NAME], ", Last>") == NULL)
|
||||
fatal("range start not followed by range end");
|
||||
last = getcode(fields2[FIELD_CODE]);
|
||||
if(last <= code)
|
||||
fatal("range out of sequence: %x then %x", code, last);
|
||||
if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
|
||||
fatal("range with mismatched category");
|
||||
}
|
||||
|
||||
/*
|
||||
* set properties and conversions
|
||||
*/
|
||||
for (; code <= last; code++){
|
||||
if(p[0] == 'L')
|
||||
myisalpha[code] = 1;
|
||||
if(p[0] == 'Z')
|
||||
myisspace[code] = 1;
|
||||
|
||||
if(strcmp(p, "Lu") == 0)
|
||||
myisupper[code] = 1;
|
||||
if(strcmp(p, "Ll") == 0)
|
||||
myislower[code] = 1;
|
||||
|
||||
if(strcmp(p, "Lt") == 0)
|
||||
myistitle[code] = 1;
|
||||
|
||||
if(strcmp(p, "Nd") == 0)
|
||||
myisdigit[code] = 1;
|
||||
|
||||
/*
|
||||
* when finding conversions, also need to mark
|
||||
* upper/lower case, since some chars, like
|
||||
* "III" (0x2162), aren't defined as letters but have a
|
||||
* lower case mapping ("iii" (0x2172)).
|
||||
*/
|
||||
if(fields[FIELD_UPPER][0] != '\0'){
|
||||
mytoupper[code] = getcode(fields[FIELD_UPPER]);
|
||||
}
|
||||
if(fields[FIELD_LOWER][0] != '\0'){
|
||||
mytolower[code] = getcode(fields[FIELD_LOWER]);
|
||||
}
|
||||
if(fields[FIELD_TITLE][0] != '\0'){
|
||||
mytotitle[code] = getcode(fields[FIELD_TITLE]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fclose(in);
|
||||
|
||||
/*
|
||||
* check for codes with no totitle mapping but a toupper mapping.
|
||||
* these appear in UnicodeData-2.0.14.txt, but are almost certainly
|
||||
* erroneous.
|
||||
*/
|
||||
for(i = 0; i < NRUNES; i++){
|
||||
if(mytotitle[i] == i
|
||||
&& mytoupper[i] != i
|
||||
&& !myistitle[i])
|
||||
fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]);
|
||||
}
|
||||
|
||||
/*
|
||||
* make sure isupper[c] is true if for some x toupper[x] == c
|
||||
* ditto for islower and istitle
|
||||
*/
|
||||
for(i = 0; i < NRUNES; i++) {
|
||||
if(mytoupper[i] != i)
|
||||
myisupper[mytoupper[i]] = 1;
|
||||
if(mytolower[i] != i)
|
||||
myislower[mytolower[i]] = 1;
|
||||
if(mytotitle[i] != i)
|
||||
myistitle[mytotitle[i]] = 1;
|
||||
}
|
||||
|
||||
if(docheck){
|
||||
check();
|
||||
}else{
|
||||
mktables(argv[0], usepairs);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* generate a properties array for ranges, clearing those cases covered.
|
||||
* if force, generate one-entry ranges for singletons.
|
||||
*/
|
||||
static int
|
||||
mkisrange(const char* label, char* prop, int force)
|
||||
{
|
||||
int start, stop, some;
|
||||
|
||||
/*
|
||||
* first, the ranges
|
||||
*/
|
||||
some = 0;
|
||||
for(start = 0; start < NRUNES; ) {
|
||||
if(!prop[start]){
|
||||
start++;
|
||||
continue;
|
||||
}
|
||||
|
||||
for(stop = start + 1; stop < NRUNES; stop++){
|
||||
if(!prop[stop]){
|
||||
break;
|
||||
}
|
||||
prop[stop] = 0;
|
||||
}
|
||||
if(force || stop != start + 1){
|
||||
if(!some){
|
||||
printf("static Rune __is%sr[] = {\n", label);
|
||||
some = 1;
|
||||
}
|
||||
prop[start] = 0;
|
||||
printf("\t0x%.4x, 0x%.4x,\n", start, stop - 1);
|
||||
}
|
||||
|
||||
start = stop;
|
||||
}
|
||||
if(some)
|
||||
printf("};\n\n");
|
||||
return some;
|
||||
}
|
||||
|
||||
/*
|
||||
* generate a mapping array for pairs with a skip between,
|
||||
* clearing those entries covered.
|
||||
*/
|
||||
static int
|
||||
mkispair(const char *label, char *prop)
|
||||
{
|
||||
int start, stop, some;
|
||||
|
||||
some = 0;
|
||||
for(start = 0; start + 2 < NRUNES; ) {
|
||||
if(!prop[start]){
|
||||
start++;
|
||||
continue;
|
||||
}
|
||||
|
||||
for(stop = start + 2; stop < NRUNES; stop += 2){
|
||||
if(!prop[stop]){
|
||||
break;
|
||||
}
|
||||
prop[stop] = 0;
|
||||
}
|
||||
if(stop != start + 2){
|
||||
if(!some){
|
||||
printf("static Rune __is%sp[] = {\n", label);
|
||||
some = 1;
|
||||
}
|
||||
prop[start] = 0;
|
||||
printf("\t0x%.4x, 0x%.4x,\n", start, stop - 2);
|
||||
}
|
||||
|
||||
start = stop;
|
||||
}
|
||||
if(some)
|
||||
printf("};\n\n");
|
||||
return some;
|
||||
}
|
||||
|
||||
/*
|
||||
* generate a properties array for singletons, clearing those cases covered.
|
||||
*/
|
||||
static int
|
||||
mkissingle(const char *label, char *prop)
|
||||
{
|
||||
int start, some;
|
||||
|
||||
some = 0;
|
||||
for(start = 0; start < NRUNES; start++) {
|
||||
if(!prop[start]){
|
||||
continue;
|
||||
}
|
||||
|
||||
if(!some){
|
||||
printf("static Rune __is%ss[] = {\n", label);
|
||||
some = 1;
|
||||
}
|
||||
prop[start] = 0;
|
||||
printf("\t0x%.4x,\n", start);
|
||||
}
|
||||
if(some)
|
||||
printf("};\n\n");
|
||||
return some;
|
||||
}
|
||||
|
||||
/*
 * generate tables and a function for is<label>rune.
 * the range table is written first, then (only if usepairs) the pair
 * table, then the singleton table; the generated function searches
 * whichever of those tables were actually produced.
 */
static void
mkis(const char* label, char* prop, int usepairs)
{
	int haveranges, havepairs, havesingles;

	/* order matters: each pass clears the entries it has covered */
	haveranges = mkisrange(label, prop, 0);
	havepairs = usepairs ? mkispair(label, prop) : 0;
	havesingles = mkissingle(label, prop);

	printf(
		"int\n"
		"is%srune(Rune c)\n"
		"{\n"
		" Rune *p;\n"
		"\n",
		label);

	if(haveranges){
		printf(
			" p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n"
			" if(p && c >= p[0] && c <= p[1])\n"
			" return 1;\n",
			label, label);
	}

	if(havepairs){
		printf(
			" p = rbsearch(c, __is%sp, nelem(__is%sp)/2, 2);\n"
			" if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
			" return 1;\n",
			label, label);
	}

	if(havesingles){
		printf(
			" p = rbsearch(c, __is%ss, nelem(__is%ss), 1);\n"
			" if(p && c == p[0])\n"
			" return 1;\n",
			label, label);
	}

	fputs(
		" return 0;\n"
		"}\n"
		"\n",
		stdout);
}
|
||||
|
||||
/*
|
||||
* generate a mapping array for ranges, clearing those entries covered.
|
||||
* if force, generate one-entry ranges for singletons.
|
||||
*/
|
||||
static int
|
||||
mktorange(const char* label, int* map, int force)
|
||||
{
|
||||
int start, stop, delta, some;
|
||||
|
||||
some = 0;
|
||||
for(start = 0; start < NRUNES; ) {
|
||||
if(map[start] == start){
|
||||
start++;
|
||||
continue;
|
||||
}
|
||||
|
||||
delta = TO_DELTA(map[start], start);
|
||||
if(delta != (Rune)delta)
|
||||
fatal("bad map delta %d", delta);
|
||||
for(stop = start + 1; stop < NRUNES; stop++){
|
||||
if(TO_DELTA(map[stop], stop) != delta){
|
||||
break;
|
||||
}
|
||||
map[stop] = stop;
|
||||
}
|
||||
if(stop != start + 1){
|
||||
if(!some){
|
||||
printf("static Rune __to%sr[] = {\n", label);
|
||||
some = 1;
|
||||
}
|
||||
map[start] = start;
|
||||
printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta);
|
||||
}
|
||||
|
||||
start = stop;
|
||||
}
|
||||
if(some)
|
||||
printf("};\n\n");
|
||||
return some;
|
||||
}
|
||||
|
||||
/*
|
||||
* generate a mapping array for pairs with a skip between,
|
||||
* clearing those entries covered.
|
||||
*/
|
||||
static int
|
||||
mktopair(const char* label, int* map)
|
||||
{
|
||||
int start, stop, delta, some;
|
||||
|
||||
some = 0;
|
||||
for(start = 0; start + 2 < NRUNES; ) {
|
||||
if(map[start] == start){
|
||||
start++;
|
||||
continue;
|
||||
}
|
||||
|
||||
delta = TO_DELTA(map[start], start);
|
||||
if(delta != (Rune)delta)
|
||||
fatal("bad map delta %d", delta);
|
||||
for(stop = start + 2; stop < NRUNES; stop += 2){
|
||||
if(TO_DELTA(map[stop], stop) != delta){
|
||||
break;
|
||||
}
|
||||
map[stop] = stop;
|
||||
}
|
||||
if(stop != start + 2){
|
||||
if(!some){
|
||||
printf("static Rune __to%sp[] = {\n", label);
|
||||
some = 1;
|
||||
}
|
||||
map[start] = start;
|
||||
printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 2, delta);
|
||||
}
|
||||
|
||||
start = stop;
|
||||
}
|
||||
if(some)
|
||||
printf("};\n\n");
|
||||
return some;
|
||||
}
|
||||
|
||||
/*
|
||||
* generate a mapping array for singletons, clearing those entries covered.
|
||||
*/
|
||||
static int
|
||||
mktosingle(const char* label, int* map)
|
||||
{
|
||||
int start, delta, some;
|
||||
|
||||
some = 0;
|
||||
for(start = 0; start < NRUNES; start++) {
|
||||
if(map[start] == start){
|
||||
continue;
|
||||
}
|
||||
|
||||
delta = TO_DELTA(map[start], start);
|
||||
if(delta != (Rune)delta)
|
||||
fatal("bad map delta %d", delta);
|
||||
if(!some){
|
||||
printf("static Rune __to%ss[] = {\n", label);
|
||||
some = 1;
|
||||
}
|
||||
map[start] = start;
|
||||
printf("\t0x%.4x, %d,\n", start, delta);
|
||||
}
|
||||
if(some)
|
||||
printf("};\n\n");
|
||||
return some;
|
||||
}
|
||||
|
||||
/*
|
||||
* generate tables and a function for to<label>rune
|
||||
*/
|
||||
static void
|
||||
mkto(const char* label, int* map, int usepairs)
|
||||
{
|
||||
int tor, top, tos;
|
||||
|
||||
tor = mktorange(label, map, 0);
|
||||
top = 0;
|
||||
if(usepairs)
|
||||
top = mktopair(label, map);
|
||||
tos = mktosingle(label, map);
|
||||
|
||||
printf(
|
||||
"Rune\n"
|
||||
"to%srune(Rune c)\n"
|
||||
"{\n"
|
||||
" Rune *p;\n"
|
||||
"\n",
|
||||
label);
|
||||
|
||||
if(tor)
|
||||
printf(
|
||||
" p = rbsearch(c, __to%sr, nelem(__to%sr)/3, 3);\n"
|
||||
" if(p && c >= p[0] && c <= p[1])\n"
|
||||
" return c + p[2] - %d;\n",
|
||||
label, label, TO_OFFSET);
|
||||
|
||||
if(top)
|
||||
printf(
|
||||
" p = rbsearch(c, __to%sp, nelem(__to%sp)/3, 3);\n"
|
||||
" if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
|
||||
" return c + p[2] - %d;\n",
|
||||
label, label, TO_OFFSET);
|
||||
|
||||
if(tos)
|
||||
printf(
|
||||
" p = rbsearch(c, __to%ss, nelem(__to%ss)/2, 2);\n"
|
||||
" if(p && c == p[0])\n"
|
||||
" return c + p[1] - %d;\n",
|
||||
label, label, TO_OFFSET);
|
||||
|
||||
|
||||
printf(
|
||||
" return c;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
);
|
||||
}
|
||||
|
||||
// Make only range tables and a function for is<label>rune.
// Singletons are forced into the range table as one-entry ranges.
// If no code point has the property, mkisrange prints no table, so the
// generated function must not refer to one; it just returns 0.
static void
mkisronly(const char* label, char* prop) {
	if(!mkisrange(label, prop, 1)){
		printf(
			"int\n"
			"is%srune(Rune c)\n"
			"{\n"
			" return 0;\n"
			"}\n"
			"\n",
			label);
		return;
	}

	printf(
		"int\n"
		"is%srune(Rune c)\n"
		"{\n"
		" Rune *p;\n"
		"\n"
		" p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n"
		" if(p && c >= p[0] && c <= p[1])\n"
		" return 1;\n"
		" return 0;\n"
		"}\n"
		"\n",
		label, label, label);
}
|
||||
|
||||
/*
|
||||
* generate the body of runetype.
|
||||
* assumes there is a function Rune* rbsearch(Rune c, Rune *t, int n, int ne);
|
||||
*/
|
||||
static void
|
||||
mktables(char *src, int usepairs)
|
||||
{
|
||||
printf("/* generated automatically by mkrunetype.c from %s */\n\n",
|
||||
basename(src));
|
||||
|
||||
/*
|
||||
* we special case the space and digit tables, since they are assumed
|
||||
* to be small with several ranges.
|
||||
*/
|
||||
mkisronly("space", myisspace);
|
||||
mkisronly("digit", myisdigit);
|
||||
|
||||
mkis("alpha", myisalpha, 0);
|
||||
mkis("upper", myisupper, usepairs);
|
||||
mkis("lower", myislower, usepairs);
|
||||
mkis("title", myistitle, usepairs);
|
||||
|
||||
mkto("upper", mytoupper, usepairs);
|
||||
mkto("lower", mytolower, usepairs);
|
||||
mkto("title", mytotitle, usepairs);
|
||||
}
|
||||
|
||||
/*
|
||||
* find differences between the newly generated tables and current runetypes.
|
||||
*/
|
||||
static void
|
||||
check(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for(i = 0; i < NRUNES; i++){
|
||||
if(isdigitrune(i) != myisdigit[i])
|
||||
fprintf(stderr, "isdigit diff at %x: runetype=%x, unicode=%x\n",
|
||||
i, isdigitrune(i), myisdigit[i]);
|
||||
|
||||
if(isspacerune(i) != myisspace[i])
|
||||
fprintf(stderr, "isspace diff at %x: runetype=%x, unicode=%x\n",
|
||||
i, isspacerune(i), myisspace[i]);
|
||||
|
||||
if(isupperrune(i) != myisupper[i])
|
||||
fprintf(stderr, "isupper diff at %x: runetype=%x, unicode=%x\n",
|
||||
i, isupperrune(i), myisupper[i]);
|
||||
|
||||
if(islowerrune(i) != myislower[i])
|
||||
fprintf(stderr, "islower diff at %x: runetype=%x, unicode=%x\n",
|
||||
i, islowerrune(i), myislower[i]);
|
||||
|
||||
if(isalpharune(i) != myisalpha[i])
|
||||
fprintf(stderr, "isalpha diff at %x: runetype=%x, unicode=%x\n",
|
||||
i, isalpharune(i), myisalpha[i]);
|
||||
|
||||
if(toupperrune(i) != mytoupper[i])
|
||||
fprintf(stderr, "toupper diff at %x: runetype=%x, unicode=%x\n",
|
||||
i, toupperrune(i), mytoupper[i]);
|
||||
|
||||
if(tolowerrune(i) != mytolower[i])
|
||||
fprintf(stderr, "tolower diff at %x: runetype=%x, unicode=%x\n",
|
||||
i, tolowerrune(i), mytolower[i]);
|
||||
|
||||
if(istitlerune(i) != myistitle[i])
|
||||
fprintf(stderr, "istitle diff at %x: runetype=%x, unicode=%x\n",
|
||||
i, istitlerune(i), myistitle[i]);
|
||||
|
||||
if(totitlerune(i) != mytotitle[i])
|
||||
fprintf(stderr, "totitle diff at %x: runetype=%x, unicode=%x\n",
|
||||
i, totitlerune(i), mytotitle[i]);
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * split str in place into at most nfields fields.
 * every character of str that occurs in delim is overwritten with '\0'
 * and starts a new field, so adjacent delimiters yield empty fields.
 * once nfields fields have been started the rest of str is left
 * untouched as part of the last field.
 * pointers to the fields are stored in fields[0..nf-1].
 * returns nf, the number of fields stored; 0 (with nothing written)
 * when nfields is not positive.
 */
static int
mygetfields(char **fields, int nfields, char *str, const char *delim)
{
	int nf;

	/* no room for even one pointer: do not touch fields[0] */
	if(nfields <= 0)
		return 0;

	fields[0] = str;
	nf = 1;
	for(; nf < nfields && *str != '\0'; str++){
		if(strchr(delim, *str) != NULL){
			*str = '\0';
			fields[nf++] = str + 1;
		}
	}
	return nf;
}
|
||||
|
||||
static int
|
||||
getunicodeline(FILE *in, char **fields, char *buf)
|
||||
{
|
||||
char *p;
|
||||
|
||||
if(fgets(buf, MAX_LINE, in) == NULL)
|
||||
return 0;
|
||||
|
||||
p = strchr(buf, '\n');
|
||||
if (p == NULL)
|
||||
fatal("line too long");
|
||||
*p = '\0';
|
||||
|
||||
if (mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS)
|
||||
fatal("bad number of fields");
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int
|
||||
getcode(char *s)
|
||||
{
|
||||
int i, code;
|
||||
|
||||
code = 0;
|
||||
i = 0;
|
||||
/* Parse a hex number */
|
||||
while(s[i]) {
|
||||
code <<= 4;
|
||||
if(s[i] >= '0' && s[i] <= '9')
|
||||
code += s[i] - '0';
|
||||
else if(s[i] >= 'A' && s[i] <= 'F')
|
||||
code += s[i] - 'A' + 10;
|
||||
else
|
||||
fatal("bad code char '%c'", s[i]);
|
||||
i++;
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
static void
|
||||
fatal(const char *fmt, ...)
|
||||
{
|
||||
va_list arg;
|
||||
|
||||
fprintf(stderr, "%s: fatal error: ", argv0);
|
||||
va_start(arg, fmt);
|
||||
vfprintf(stderr, fmt, arg);
|
||||
va_end(arg);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
exit(1);
|
||||
}
|
@ -1,20 +1,21 @@
|
||||
/*
|
||||
* The authors of this software are Rob Pike and Ken Thompson.
|
||||
* Copyright (c) 2002 by Lucent Technologies.
|
||||
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
* Permission to use, copy, modify, and distribute this software for any
|
||||
* purpose without fee is hereby granted, provided that this entire notice
|
||||
* is included in all copies of any software which is or includes a copy
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include "plan9.h"
|
||||
#include "utf.h"
|
||||
#include "utfdef.h"
|
||||
|
||||
enum
|
||||
{
|
||||
@ -23,27 +24,150 @@ enum
|
||||
Bit2 = 5,
|
||||
Bit3 = 4,
|
||||
Bit4 = 3,
|
||||
Bit5 = 2,
|
||||
|
||||
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
||||
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
||||
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
||||
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
||||
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
||||
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
|
||||
|
||||
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
||||
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
||||
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
||||
Rune4 = (1<<(Bit4+3*Bitx))-1,
|
||||
/* 0001 1111 1111 1111 1111 1111 */
|
||||
|
||||
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
||||
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
||||
|
||||
Bad = Runeerror
|
||||
Bad = Runeerror,
|
||||
};
|
||||
|
||||
/*
|
||||
* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
|
||||
* This is a slower but "safe" version of the old chartorune
|
||||
* that works on strings that are not necessarily null-terminated.
|
||||
*
|
||||
* If you know for sure that your string is null-terminated,
|
||||
* chartorune will be a bit faster.
|
||||
*
|
||||
* It is guaranteed not to attempt to access "length"
|
||||
* past the incoming pointer. This is to avoid
|
||||
* possible access violations. If the string appears to be
|
||||
* well-formed but incomplete (i.e., to get the whole Rune
|
||||
* we'd need to read past str+length) then we'll set the Rune
|
||||
* to Bad and return 0.
|
||||
*
|
||||
* Note that if we have decoding problems for other
|
||||
* reasons, we return 1 instead of 0.
|
||||
*/
|
||||
int
|
||||
chartorune(Rune *rune, char *str)
|
||||
charntorune(Rune *rune, const char *str, int length)
|
||||
{
|
||||
int c, c1, c2;
|
||||
int c, c1, c2, c3;
|
||||
long l;
|
||||
|
||||
/* When we're not allowed to read anything */
|
||||
if(length <= 0) {
|
||||
goto badlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* one character sequence (7-bit value)
|
||||
* 00000-0007F => T1
|
||||
*/
|
||||
c = *(uchar*)str;
|
||||
if(c < Tx) {
|
||||
*rune = c;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// If we can't read more than one character we must stop
|
||||
if(length <= 1) {
|
||||
goto badlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* two character sequence (11-bit value)
|
||||
* 0080-07FF => T2 Tx
|
||||
*/
|
||||
c1 = *(uchar*)(str+1) ^ Tx;
|
||||
if(c1 & Testx)
|
||||
goto bad;
|
||||
if(c < T3) {
|
||||
if(c < T2)
|
||||
goto bad;
|
||||
l = ((c << Bitx) | c1) & Rune2;
|
||||
if(l <= Rune1)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 2;
|
||||
}
|
||||
|
||||
// If we can't read more than two characters we must stop
|
||||
if(length <= 2) {
|
||||
goto badlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* three character sequence (16-bit value)
|
||||
* 0800-FFFF => T3 Tx Tx
|
||||
*/
|
||||
c2 = *(uchar*)(str+2) ^ Tx;
|
||||
if(c2 & Testx)
|
||||
goto bad;
|
||||
if(c < T4) {
|
||||
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
||||
if(l <= Rune2)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 3;
|
||||
}
|
||||
|
||||
if (length <= 3)
|
||||
goto badlen;
|
||||
|
||||
/*
|
||||
* four character sequence (21-bit value)
|
||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||
*/
|
||||
c3 = *(uchar*)(str+3) ^ Tx;
|
||||
if (c3 & Testx)
|
||||
goto bad;
|
||||
if (c < T5) {
|
||||
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||
if (l <= Rune3)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 4;
|
||||
}
|
||||
|
||||
// Support for 5-byte or longer UTF-8 would go here, but
|
||||
// since we don't have that, we'll just fall through to bad.
|
||||
|
||||
/*
|
||||
* bad decoding
|
||||
*/
|
||||
bad:
|
||||
*rune = Bad;
|
||||
return 1;
|
||||
badlen:
|
||||
*rune = Bad;
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* This is the older "unsafe" version, which works fine on
|
||||
* null-terminated strings.
|
||||
*/
|
||||
int
|
||||
chartorune(Rune *rune, const char *str)
|
||||
{
|
||||
int c, c1, c2, c3;
|
||||
long l;
|
||||
|
||||
/*
|
||||
@ -88,6 +212,26 @@ chartorune(Rune *rune, char *str)
|
||||
return 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* four character sequence (21-bit value)
|
||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||
*/
|
||||
c3 = *(uchar*)(str+3) ^ Tx;
|
||||
if (c3 & Testx)
|
||||
goto bad;
|
||||
if (c < T5) {
|
||||
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||
if (l <= Rune3)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 4;
|
||||
}
|
||||
|
||||
/*
|
||||
* Support for 5-byte or longer UTF-8 would go here, but
|
||||
* since we don't have that, we'll just fall through to bad.
|
||||
*/
|
||||
|
||||
/*
|
||||
* bad decoding
|
||||
*/
|
||||
@ -97,9 +241,16 @@ bad:
|
||||
}
|
||||
|
||||
int
|
||||
runetochar(char *str, Rune *rune)
|
||||
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
|
||||
*consumed = charntorune(rune, str, length);
|
||||
return *rune != Runeerror || *consumed == 3;
|
||||
}
|
||||
|
||||
int
|
||||
runetochar(char *str, const Rune *rune)
|
||||
{
|
||||
long c;
|
||||
/* Runes are signed, so convert to unsigned for range check. */
|
||||
unsigned long c;
|
||||
|
||||
/*
|
||||
* one character sequence
|
||||
@ -121,57 +272,80 @@ runetochar(char *str, Rune *rune)
|
||||
return 2;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the Rune is out of range, convert it to the error rune.
|
||||
* Do this test here because the error rune encodes to three bytes.
|
||||
* Doing it earlier would duplicate work, since an out of range
|
||||
* Rune wouldn't have fit in one or two bytes.
|
||||
*/
|
||||
if (c > Runemax)
|
||||
c = Runeerror;
|
||||
|
||||
/*
|
||||
* three character sequence
|
||||
* 0800-FFFF => T3 Tx Tx
|
||||
*/
|
||||
str[0] = T3 | (c >> 2*Bitx);
|
||||
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||
str[2] = Tx | (c & Maskx);
|
||||
return 3;
|
||||
if (c <= Rune3) {
|
||||
str[0] = T3 | (c >> 2*Bitx);
|
||||
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||
str[2] = Tx | (c & Maskx);
|
||||
return 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* four character sequence (21-bit value)
|
||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||
*/
|
||||
str[0] = T4 | (c >> 3*Bitx);
|
||||
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
||||
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||
str[3] = Tx | (c & Maskx);
|
||||
return 4;
|
||||
}
|
||||
|
||||
int
|
||||
runelen(long c)
|
||||
runelen(Rune rune)
|
||||
{
|
||||
Rune rune;
|
||||
char str[10];
|
||||
|
||||
rune = c;
|
||||
return runetochar(str, &rune);
|
||||
}
|
||||
|
||||
int
|
||||
runenlen(Rune *r, int nrune)
|
||||
runenlen(const Rune *r, int nrune)
|
||||
{
|
||||
int nb, c;
|
||||
|
||||
nb = 0;
|
||||
while(nrune--) {
|
||||
c = *r++;
|
||||
if(c <= Rune1)
|
||||
if (c <= Rune1)
|
||||
nb++;
|
||||
else
|
||||
if(c <= Rune2)
|
||||
else if (c <= Rune2)
|
||||
nb += 2;
|
||||
else
|
||||
else if (c <= Rune3)
|
||||
nb += 3;
|
||||
else /* assert(c <= Rune4) */
|
||||
nb += 4;
|
||||
}
|
||||
return nb;
|
||||
}
|
||||
|
||||
int
|
||||
fullrune(char *str, int n)
|
||||
fullrune(const char *str, int n)
|
||||
{
|
||||
int c;
|
||||
|
||||
if(n > 0) {
|
||||
c = *(uchar*)str;
|
||||
if(c < Tx)
|
||||
if (n > 0) {
|
||||
int c = *(uchar*)str;
|
||||
if (c < Tx)
|
||||
return 1;
|
||||
if(n > 1)
|
||||
if(c < T3 || n > 2)
|
||||
if (n > 1) {
|
||||
if (c < T3)
|
||||
return 1;
|
||||
if (n > 2) {
|
||||
if (c < T4 || n > 3)
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
248
src/lib9/utf/utf.h
Normal file
248
src/lib9/utf/utf.h
Normal file
@ -0,0 +1,248 @@
|
||||
/*
|
||||
* The authors of this software are Rob Pike and Ken Thompson.
|
||||
* Copyright (c) 1998-2002 by Lucent Technologies.
|
||||
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
* Permission to use, copy, modify, and distribute this software for any
|
||||
* purpose without fee is hereby granted, provided that this entire notice
|
||||
* is included in all copies of any software which is or includes a copy
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
|
||||
#ifndef _UTFH_
|
||||
#define _UTFH_ 1
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
|
||||
|
||||
enum
|
||||
{
|
||||
UTFmax = 4, /* maximum bytes per rune */
|
||||
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
|
||||
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
|
||||
Runeerror = 0xFFFD, /* decoding error in UTF */
|
||||
Runemax = 0x10FFFF, /* maximum rune value */
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* rune routines
|
||||
*/
|
||||
|
||||
/*
|
||||
* These routines were written by Rob Pike and Ken Thompson
|
||||
* and first appeared in Plan 9.
|
||||
* SEE ALSO
|
||||
* utf (7)
|
||||
* tcs (1)
|
||||
*/
|
||||
|
||||
// runetochar copies (encodes) one rune, pointed to by r, to at most
|
||||
// UTFmax bytes starting at s and returns the number of bytes generated.
|
||||
|
||||
int runetochar(char* s, const Rune* r);
|
||||
|
||||
|
||||
// chartorune copies (decodes) at most UTFmax bytes starting at s to
|
||||
// one rune, pointed to by r, and returns the number of bytes consumed.
|
||||
// If the input is not exactly in UTF format, chartorune will set *r
|
||||
// to Runeerror and return 1.
|
||||
//
|
||||
// Note: There is no special case for a "null-terminated" string. A
|
||||
// string whose first byte has the value 0 is the UTF8 encoding of the
|
||||
// Unicode value 0 (i.e., ASCII NUL). A byte value of 0 is illegal
|
||||
// anywhere else in a UTF sequence.
|
||||
|
||||
int chartorune(Rune* r, const char* s);
|
||||
|
||||
|
||||
// charntorune is like chartorune, except that it will access at most
|
||||
// n bytes of s. If the UTF sequence is incomplete within n bytes,
|
||||
// charntorune will set *r to Runeerror and return 0. If it is complete
|
||||
// but not in UTF format, it will set *r to Runeerror and return 1.
|
||||
//
|
||||
// Added 2004-09-24 by Wei-Hwa Huang
|
||||
|
||||
int charntorune(Rune* r, const char* s, int n);
|
||||
|
||||
// isvalidcharntorune(str, n, r, consumed)
|
||||
// is a convenience function that calls "*consumed = charntorune(r, str, n)"
|
||||
// and returns an int (logically boolean) indicating whether the first
|
||||
// n bytes of str was a valid and complete UTF sequence.
|
||||
|
||||
int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
|
||||
|
||||
// runelen returns the number of bytes required to convert r into UTF.
|
||||
|
||||
int runelen(Rune r);
|
||||
|
||||
|
||||
// runenlen returns the number of bytes required to convert the n
|
||||
// runes pointed to by r into UTF.
|
||||
|
||||
int runenlen(const Rune* r, int n);
|
||||
|
||||
|
||||
// fullrune returns 1 if the string s of length n is long enough to be
|
||||
// decoded by chartorune, and 0 otherwise. This does not guarantee
|
||||
// that the string contains a legal UTF encoding. This routine is used
|
||||
// by programs that obtain input one byte at a time and need to know
|
||||
// when a full rune has arrived.
|
||||
|
||||
int fullrune(const char* s, int n);
|
||||
|
||||
// The following routines are analogous to the corresponding string
|
||||
// routines with "utf" substituted for "str", and "rune" substituted
|
||||
// for "chr".
|
||||
|
||||
// utflen returns the number of runes that are represented by the UTF
|
||||
// string s. (cf. strlen)
|
||||
|
||||
int utflen(const char* s);
|
||||
|
||||
|
||||
// utfnlen returns the number of complete runes that are represented
|
||||
// by the first n bytes of the UTF string s. If the last few bytes of
|
||||
// the string contain an incompletely coded rune, utfnlen will not
|
||||
// count them; in this way, it differs from utflen, which includes
|
||||
// every byte of the string. (cf. strnlen)
|
||||
|
||||
int utfnlen(const char* s, long n);
|
||||
|
||||
|
||||
// utfrune returns a pointer to the first occurrence of rune r in the
|
||||
// UTF string s, or 0 if r does not occur in the string. The NUL
|
||||
// byte terminating a string is considered to be part of the string s.
|
||||
// (cf. strchr)
|
||||
|
||||
const char* utfrune(const char* s, Rune r);
|
||||
|
||||
|
||||
// utfrrune returns a pointer to the last occurrence of rune r in the
|
||||
// UTF string s, or 0 if r does not occur in the string. The NUL
|
||||
// byte terminating a string is considered to be part of the string s.
|
||||
// (cf. strrchr)
|
||||
|
||||
const char* utfrrune(const char* s, Rune r);
|
||||
|
||||
|
||||
// utfutf returns a pointer to the first occurrence of the UTF string
|
||||
// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
|
||||
// null string, utfutf returns s1. (cf. strstr)
|
||||
|
||||
const char* utfutf(const char* s1, const char* s2);
|
||||
|
||||
|
||||
// utfecpy copies UTF sequences until a null sequence has been copied,
|
||||
// but writes no sequences beyond es1. If any sequences are copied,
|
||||
// s1 is terminated by a null sequence, and a pointer to that sequence
|
||||
// is returned. Otherwise, the original s1 is returned. (cf. strecpy)
|
||||
|
||||
char* utfecpy(char *s1, char *es1, const char *s2);
|
||||
|
||||
|
||||
|
||||
// These functions are rune-string analogues of the corresponding
|
||||
// functions in strcat (3).
|
||||
//
|
||||
// These routines first appeared in Plan 9.
|
||||
// SEE ALSO
|
||||
// memmove (3)
|
||||
// rune (3)
|
||||
// strcat (2)
|
||||
//
|
||||
// BUGS: The outcome of overlapping moves varies among implementations.
|
||||
|
||||
Rune* runestrcat(Rune* s1, const Rune* s2);
|
||||
Rune* runestrncat(Rune* s1, const Rune* s2, long n);
|
||||
|
||||
const Rune* runestrchr(const Rune* s, Rune c);
|
||||
|
||||
int runestrcmp(const Rune* s1, const Rune* s2);
|
||||
int runestrncmp(const Rune* s1, const Rune* s2, long n);
|
||||
|
||||
Rune* runestrcpy(Rune* s1, const Rune* s2);
|
||||
Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
|
||||
Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
|
||||
|
||||
Rune* runestrdup(const Rune* s);
|
||||
|
||||
const Rune* runestrrchr(const Rune* s, Rune c);
|
||||
long runestrlen(const Rune* s);
|
||||
const Rune* runestrstr(const Rune* s1, const Rune* s2);
|
||||
|
||||
|
||||
|
||||
// The following routines test types and modify cases for Unicode
|
||||
// characters. Unicode defines some characters as letters and
|
||||
// specifies three cases: upper, lower, and title. Mappings among the
|
||||
// cases are also defined, although they are not exhaustive: some
|
||||
// upper case letters have no lower case mapping, and so on. Unicode
|
||||
// also defines several character properties, a subset of which are
|
||||
// checked by these routines. These routines are based on Unicode
|
||||
// version 3.0.0.
|
||||
//
|
||||
// NOTE: The routines are implemented in C, so the boolean functions
|
||||
// (e.g., isupperrune) return 0 for false and 1 for true.
|
||||
//
|
||||
//
|
||||
// toupperrune, tolowerrune, and totitlerune are the Unicode case
|
||||
// mappings. These routines return the character unchanged if it has
|
||||
// no defined mapping.
|
||||
|
||||
Rune toupperrune(Rune r);
|
||||
Rune tolowerrune(Rune r);
|
||||
Rune totitlerune(Rune r);
|
||||
|
||||
|
||||
// isupperrune tests for upper case characters, including Unicode
|
||||
// upper case letters and targets of the toupper mapping. islowerrune
|
||||
// and istitlerune are defined analogously.
|
||||
|
||||
int isupperrune(Rune r);
|
||||
int islowerrune(Rune r);
|
||||
int istitlerune(Rune r);
|
||||
|
||||
|
||||
// isalpharune tests for Unicode letters; this includes ideographs in
|
||||
// addition to alphabetic characters.
|
||||
|
||||
int isalpharune(Rune r);
|
||||
|
||||
|
||||
// isdigitrune tests for digits. Non-digit numbers, such as Roman
|
||||
// numerals, are not included.
|
||||
|
||||
int isdigitrune(Rune r);
|
||||
|
||||
|
||||
// isideographicrune tests for ideographic characters and numbers, as
|
||||
// defined by the Unicode standard.
|
||||
|
||||
int isideographicrune(Rune r);
|
||||
|
||||
|
||||
// isspacerune tests for whitespace characters, including "C" locale
|
||||
// whitespace, Unicode defined whitespace, and the "zero-width
|
||||
// non-break space" character.
|
||||
|
||||
int isspacerune(Rune r);
|
||||
|
||||
|
||||
// (The comments in this file were copied from the manpage files rune.3,
|
||||
// isalpharune.3, and runestrcat.3. Some formatting changes were also made
|
||||
// to conform to Google style. /JRM 11/11/05)
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
@ -12,36 +12,17 @@
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* compiler directive on Plan 9
|
||||
*/
|
||||
#ifndef USED
|
||||
#define USED(x) if(x);else
|
||||
#endif
|
||||
#define uchar _utfuchar
|
||||
#define ushort _utfushort
|
||||
#define uint _utfuint
|
||||
#define ulong _utfulong
|
||||
#define vlong _utfvlong
|
||||
#define uvlong _utfuvlong
|
||||
|
||||
/*
|
||||
* easiest way to make sure these are defined
|
||||
*/
|
||||
#define uchar _fmtuchar
|
||||
#define ushort _fmtushort
|
||||
#define uint _fmtuint
|
||||
#define ulong _fmtulong
|
||||
#define vlong _fmtvlong
|
||||
#define uvlong _fmtuvlong
|
||||
typedef unsigned char uchar;
|
||||
typedef unsigned short ushort;
|
||||
typedef unsigned int uint;
|
||||
typedef unsigned long ulong;
|
||||
typedef unsigned long long uvlong;
|
||||
typedef long long vlong;
|
||||
|
||||
/*
|
||||
* nil cannot be ((void*)0) on ANSI C,
|
||||
* because it is used for function pointers
|
||||
*/
|
||||
#undef nil
|
||||
#define nil 0
|
||||
|
||||
#undef nelem
|
||||
#define nelem ((void*)0)
|
||||
|
||||
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
|
||||
#define nil ((void*)0)
|
||||
|
@ -7,18 +7,17 @@
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
#define _BSD_SOURCE 1 /* memccpy */
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include "plan9.h"
|
||||
#include "utf.h"
|
||||
#include "utfdef.h"
|
||||
|
||||
char*
|
||||
utfecpy(char *to, char *e, char *from)
|
||||
utfecpy(char *to, char *e, const char *from)
|
||||
{
|
||||
char *end;
|
||||
|
||||
|
@ -7,17 +7,17 @@
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include "plan9.h"
|
||||
#include "utf.h"
|
||||
#include "utfdef.h"
|
||||
|
||||
int
|
||||
utflen(char *s)
|
||||
utflen(const char *s)
|
||||
{
|
||||
int c;
|
||||
long n;
|
||||
@ -34,4 +34,5 @@ utflen(char *s)
|
||||
s += chartorune(&rune, s);
|
||||
n++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -7,22 +7,22 @@
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include "plan9.h"
|
||||
#include "utf.h"
|
||||
#include "utfdef.h"
|
||||
|
||||
int
|
||||
utfnlen(char *s, long m)
|
||||
utfnlen(const char *s, long m)
|
||||
{
|
||||
int c;
|
||||
long n;
|
||||
Rune rune;
|
||||
char *es;
|
||||
const char *es;
|
||||
|
||||
es = s + m;
|
||||
for(n = 0; s < es; n++) {
|
||||
|
@ -7,21 +7,22 @@
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include "plan9.h"
|
||||
#include "utf.h"
|
||||
#include "utfdef.h"
|
||||
|
||||
const
|
||||
char*
|
||||
utfrrune(char *s, long c)
|
||||
utfrrune(const char *s, Rune c)
|
||||
{
|
||||
long c1;
|
||||
Rune r;
|
||||
char *s1;
|
||||
const char *s1;
|
||||
|
||||
if(c < Runesync) /* not part of utf sequence */
|
||||
return strrchr(s, c);
|
||||
@ -42,4 +43,5 @@ utfrrune(char *s, long c)
|
||||
s1 = s;
|
||||
s += c1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -7,17 +7,18 @@
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include "plan9.h"
|
||||
#include "utf.h"
|
||||
#include "utfdef.h"
|
||||
|
||||
const
|
||||
char*
|
||||
utfrune(char *s, long c)
|
||||
utfrune(const char *s, Rune c)
|
||||
{
|
||||
long c1;
|
||||
Rune r;
|
||||
@ -41,4 +42,5 @@ utfrune(char *s, long c)
|
||||
return s;
|
||||
s += n;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -7,24 +7,25 @@
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
|
||||
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include "plan9.h"
|
||||
#include "utf.h"
|
||||
#include "utfdef.h"
|
||||
|
||||
|
||||
/*
|
||||
* Return pointer to first occurrence of s2 in s1,
|
||||
* 0 if none
|
||||
*/
|
||||
const
|
||||
char*
|
||||
utfutf(char *s1, char *s2)
|
||||
utfutf(const char *s1, const char *s2)
|
||||
{
|
||||
char *p;
|
||||
const char *p;
|
||||
long f, n1, n2;
|
||||
Rune r;
|
||||
|
||||
@ -34,7 +35,7 @@ utfutf(char *s1, char *s2)
|
||||
return strstr(s1, s2);
|
||||
|
||||
n2 = strlen(s2);
|
||||
for(p=s1; p=utfrune(p, f); p+=n1)
|
||||
for(p=s1; (p=utfrune(p, f)) != 0; p+=n1)
|
||||
if(strncmp(p, s2, n2) == 0)
|
||||
return p;
|
||||
return 0;
|
||||
|
@ -20,6 +20,7 @@ LIBOFILES=\
|
||||
runtime.$O\
|
||||
map.$O\
|
||||
print.$O\
|
||||
rune.$O\
|
||||
string.$O\
|
||||
sys_file.$O\
|
||||
|
||||
|
224
src/runtime/rune.c
Normal file
224
src/runtime/rune.c
Normal file
@ -0,0 +1,224 @@
|
||||
/*
|
||||
* The authors of this software are Rob Pike and Ken Thompson.
|
||||
* Copyright (c) 2002 by Lucent Technologies.
|
||||
* Permission to use, copy, modify, and distribute this software for any
|
||||
* purpose without fee is hereby granted, provided that this entire notice
|
||||
* is included in all copies of any software which is or includes a copy
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This code is copied, with slight editing due to type differences,
|
||||
* from a subset of ../lib9/utf/rune.c
|
||||
*/
|
||||
|
||||
#include "runtime.h"
|
||||
|
||||
/*
 * UTF-8 coding constants.
 *
 * BitN is the number of payload bits carried by the lead byte of an
 * N-byte sequence; Bitx is the payload of each continuation byte.
 * TN/Tx are the marker bits of those bytes, RuneN is the largest
 * value representable in N bytes.
 */
enum
{
	Bit1	= 7,
	Bitx	= 6,
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,

	/* marker bits: everything above the payload within one byte */
	T1	= (0xFF << (Bit1+1)) & 0xFF,	/* 0000 0000 */
	Tx	= (0xFF << (Bitx+1)) & 0xFF,	/* 1000 0000 */
	T2	= (0xFF << (Bit2+1)) & 0xFF,	/* 1100 0000 */
	T3	= (0xFF << (Bit3+1)) & 0xFF,	/* 1110 0000 */
	T4	= (0xFF << (Bit4+1)) & 0xFF,	/* 1111 0000 */
	T5	= (0xFF << (Bit5+1)) & 0xFF,	/* 1111 1000 */

	/* largest value that fits in 1, 2, 3 and 4 bytes */
	Rune1	= (1 << Bit1) - 1,		/* 0000 0000 0111 1111 */
	Rune2	= (1 << (Bit2 + Bitx)) - 1,	/* 0000 0111 1111 1111 */
	Rune3	= (1 << (Bit3 + 2*Bitx)) - 1,	/* 1111 1111 1111 1111 */
	Rune4	= (1 << (Bit4 + 3*Bitx)) - 1,	/* 0001 1111 1111 1111 1111 1111 */

	Maskx	= (1 << Bitx) - 1,		/* 0011 1111 */
	Testx	= 0xFF & ~Maskx,		/* 1100 0000 */

	Runeerror	= 0xFFFD,
	Runeself	= 0x80,

	Bad	= Runeerror,

	Runemax	= 0x10FFFF,	/* maximum rune value */
};
|
||||
|
||||
/*
|
||||
* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
|
||||
* This is a slower but "safe" version of the old chartorune
|
||||
* that works on strings that are not necessarily null-terminated.
|
||||
*
|
||||
* If you know for sure that your string is null-terminated,
|
||||
* chartorune will be a bit faster.
|
||||
*
|
||||
* It is guaranteed not to attempt to access "length"
|
||||
* past the incoming pointer. This is to avoid
|
||||
* possible access violations. If the string appears to be
|
||||
* well-formed but incomplete (i.e., to get the whole Rune
|
||||
* we'd need to read past str+length) then we'll set the Rune
|
||||
* to Bad and return 0.
|
||||
*
|
||||
* Note that if we have decoding problems for other
|
||||
* reasons, we return 1 instead of 0.
|
||||
*/
|
||||
int32
|
||||
charntorune(int32 *rune, byte *str, int32 length)
|
||||
{
|
||||
int32 c, c1, c2, c3;
|
||||
int32 l;
|
||||
|
||||
/* When we're not allowed to read anything */
|
||||
if(length <= 0) {
|
||||
goto badlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* one character sequence (7-bit value)
|
||||
* 00000-0007F => T1
|
||||
*/
|
||||
c = *(byte*)str; /* cast not necessary, but kept for safety */
|
||||
if(c < Tx) {
|
||||
*rune = c;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// If we can't read more than one character we must stop
|
||||
if(length <= 1) {
|
||||
goto badlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* two character sequence (11-bit value)
|
||||
* 0080-07FF => T2 Tx
|
||||
*/
|
||||
c1 = *(byte*)(str+1) ^ Tx;
|
||||
if(c1 & Testx)
|
||||
goto bad;
|
||||
if(c < T3) {
|
||||
if(c < T2)
|
||||
goto bad;
|
||||
l = ((c << Bitx) | c1) & Rune2;
|
||||
if(l <= Rune1)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 2;
|
||||
}
|
||||
|
||||
// If we can't read more than two characters we must stop
|
||||
if(length <= 2) {
|
||||
goto badlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* three character sequence (16-bit value)
|
||||
* 0800-FFFF => T3 Tx Tx
|
||||
*/
|
||||
c2 = *(byte*)(str+2) ^ Tx;
|
||||
if(c2 & Testx)
|
||||
goto bad;
|
||||
if(c < T4) {
|
||||
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
||||
if(l <= Rune2)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 3;
|
||||
}
|
||||
|
||||
if (length <= 3)
|
||||
goto badlen;
|
||||
|
||||
/*
|
||||
* four character sequence (21-bit value)
|
||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||
*/
|
||||
c3 = *(byte*)(str+3) ^ Tx;
|
||||
if (c3 & Testx)
|
||||
goto bad;
|
||||
if (c < T5) {
|
||||
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||
if (l <= Rune3)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 4;
|
||||
}
|
||||
|
||||
// Support for 5-byte or longer UTF-8 would go here, but
|
||||
// since we don't have that, we'll just fall through to bad.
|
||||
|
||||
/*
|
||||
* bad decoding
|
||||
*/
|
||||
bad:
|
||||
*rune = Bad;
|
||||
return 1;
|
||||
badlen:
|
||||
*rune = Bad;
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
int32
|
||||
runetochar(byte *str, int32 rune) /* note: in original, arg2 was pointer */
|
||||
{
|
||||
/* Runes are signed, so convert to unsigned for range check. */
|
||||
uint32 c;
|
||||
|
||||
/*
|
||||
* one character sequence
|
||||
* 00000-0007F => 00-7F
|
||||
*/
|
||||
c = rune;
|
||||
if(c <= Rune1) {
|
||||
str[0] = c;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* two character sequence
|
||||
* 0080-07FF => T2 Tx
|
||||
*/
|
||||
if(c <= Rune2) {
|
||||
str[0] = T2 | (c >> 1*Bitx);
|
||||
str[1] = Tx | (c & Maskx);
|
||||
return 2;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the Rune is out of range, convert it to the error rune.
|
||||
* Do this test here because the error rune encodes to three bytes.
|
||||
* Doing it earlier would duplicate work, since an out of range
|
||||
* Rune wouldn't have fit in one or two bytes.
|
||||
*/
|
||||
if (c > Runemax)
|
||||
c = Runeerror;
|
||||
|
||||
/*
|
||||
* three character sequence
|
||||
* 0800-FFFF => T3 Tx Tx
|
||||
*/
|
||||
if (c <= Rune3) {
|
||||
str[0] = T3 | (c >> 2*Bitx);
|
||||
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||
str[2] = Tx | (c & Maskx);
|
||||
return 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* four character sequence (21-bit value)
|
||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||
*/
|
||||
str[0] = T4 | (c >> 3*Bitx);
|
||||
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
||||
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||
str[3] = Tx | (c & Maskx);
|
||||
return 4;
|
||||
}
|
@ -85,6 +85,8 @@ enum
|
||||
int32 strcmp(byte*, byte*);
|
||||
int32 findnull(int8*);
|
||||
void dump(byte*, int32);
|
||||
int32 runetochar(byte*, int32);
|
||||
int32 charntorune(int32*, byte*, int32);
|
||||
|
||||
extern string emptystring;
|
||||
extern int32 debug;
|
||||
|
@ -151,55 +151,6 @@ sys·indexstring(string s, int32 i, byte b)
|
||||
FLUSH(&b);
|
||||
}
|
||||
|
||||
/*
|
||||
* this is the plan9 runetochar
|
||||
* extended for 36 bits in 7 bytes
|
||||
* note that it truncates to 32 bits
|
||||
* through the argument passing.
|
||||
*/
|
||||
static int32
|
||||
runetochar(byte *str, uint32 c)
|
||||
{
|
||||
int32 i, n;
|
||||
uint32 mask, mark;
|
||||
|
||||
/*
|
||||
* one character in 7 bits
|
||||
*/
|
||||
if(c <= 0x07FUL) {
|
||||
str[0] = c;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* every new character picks up 5 bits
|
||||
* one less in the first byte and
|
||||
* six more in an extension byte
|
||||
*/
|
||||
mask = 0x7ffUL;
|
||||
mark = 0xC0UL;
|
||||
for(n=1;; n++) {
|
||||
if(c <= mask)
|
||||
break;
|
||||
mask = (mask<<5) | 0x1fUL;
|
||||
mark = (mark>>1) | 0x80UL;
|
||||
}
|
||||
|
||||
/*
|
||||
* lay down the bytes backwards
|
||||
* n is the number of extension bytes
|
||||
* mask is the max codepoint
|
||||
* mark is the zeroth byte indicator
|
||||
*/
|
||||
for(i=n; i>0; i--) {
|
||||
str[i] = 0x80UL | (c&0x3fUL);
|
||||
c >>= 6;
|
||||
}
|
||||
|
||||
str[0] = mark|c;
|
||||
return n+1;
|
||||
}
|
||||
|
||||
void
|
||||
sys·intstring(int64 v, string s)
|
||||
{
|
||||
|
@ -75,5 +75,14 @@ func main() {
|
||||
`\000\123\x00\312\xFE\u0123\ubabe\U0000babe`,
|
||||
"backslashes 2 (backquote)");
|
||||
assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)");
|
||||
|
||||
// test large runes. perhaps not the most logical place for this test.
|
||||
var r int32;
|
||||
r = 0x10ffff; // largest rune value
|
||||
s = string(r);
|
||||
assert(s, "\xf4\x8f\xbf\xbf", "largest rune");
|
||||
r = 0x10ffff + 1;
|
||||
s = string(r);
|
||||
assert(s, "\xef\xbf\xbd", "too-large rune");
|
||||
sys.exit(ecode);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user