1
0
mirror of https://github.com/golang/go synced 2024-11-22 06:24:38 -07:00

update to Unicode 5

SVN=126184
This commit is contained in:
Rob Pike 2008-07-07 14:07:46 -07:00
parent 0d079a5362
commit 5b904a3bde
24 changed files with 1520 additions and 1256 deletions

View File

@ -34,7 +34,7 @@ asin(arg double)double
sign = true;
}
if arg > 1 {
panic "return sys.NaN()";
return sys.NaN();
}
temp = sqrt(1 - x*x);
@ -54,7 +54,7 @@ func
acos(arg double)double
{
if(arg > 1 || arg < -1) {
panic "return sys.NaN()";
return sys.NaN();
}
return pio2 - asin(arg);
}

View File

@ -40,7 +40,7 @@ exp(arg double) double
return 0.;
}
if arg > maxf {
panic "return sys.Inf(1)"
return sys.Inf(1)
}
x = arg*log2e;

View File

@ -36,7 +36,7 @@ log(arg double) double
var exp int;
if arg <= 0 {
panic "return sys.NaN()";
return sys.NaN();
}
exp,x = sys.frexp(arg);
@ -63,7 +63,7 @@ log10(arg double) double
{
if arg <= 0 {
panic "return sys.NaN()";
return sys.NaN();
}
return log(arg) * ln10o1;
}

View File

@ -5,7 +5,25 @@
package main
import math "math"
//import math "math"
//////////////////
import math "asin"
import math "atan"
import math "atan2"
import math "exp"
import math "fabs"
import math "floor"
import math "fmod"
import math "hypot"
import math "log"
import math "pow"
import math "pow10"
import math "sin"
import math "sinh"
import math "sqrt"
import math "tan"
import math "tanh"
const
(

View File

@ -26,14 +26,14 @@ pow(arg1,arg2 double) double
if arg1 <= 0 {
if(arg1 == 0) {
if arg2 <= 0 {
panic "return sys.NaN()";
return sys.NaN();
}
return 0;
}
temp = floor(arg2);
if temp != arg2 {
panic "return sys.NaN()";
panic sys.NaN();
}
l = long(temp);

View File

@ -48,7 +48,7 @@ sinh(arg double) double
temp = exp(arg)/2;
case arg > 0.5:
// temp = (exp(arg) - exp(-arg))/2;
temp = (exp(arg) - exp(-arg))/2;
default:
argsq = arg*arg;
@ -71,5 +71,5 @@ cosh(arg double) double
if arg > 21 {
return exp(arg)/2;
}
// return (exp(arg) + exp(-arg))/2;
return (exp(arg) + exp(-arg))/2;
}

View File

@ -19,11 +19,10 @@ sqrt(arg double) double
var x, temp double;
var exp, i int;
/* BUG: NO isINF
if sys.isInf(arg, 1) {
return arg;
}
*/
if arg <= 0 {
if arg < 0 {
panic "return sys.NaN()"

View File

@ -62,7 +62,7 @@ tan(arg double) double
if flag {
if(temp == 0) {
panic "return sys.NaN()";
panic sys.NaN();
}
temp = 1/temp;
}

733
src/lib9/utf/mkrunetype.c Normal file
View File

@ -0,0 +1,733 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
* make is(upper|lower|title|space|alpha)rune and
* to(upper|lower|title)rune from a UnicodeData.txt file.
* these can be found at unicode.org
*
* with -c, runs a check of the existing runetype functions vs.
* those extracted from UnicodeData.
*
* with -p, generates tables for pairs of chars, as well as for ranges
* and singletons.
*
* UnicodeData defines 4 fields of interest:
* 1) a category
* 2) an upper case mapping
* 3) a lower case mapping
* 4) a title case mapping
*
* toupper, tolower, and totitle are defined directly from the mapping.
*
* isalpharune(c) is true iff c is a "letter" category
* isupperrune(c) is true iff c is the target of toupperrune,
* or is in the uppercase letter category
* similarly for islowerrune and istitlerune.
* isspacerune is true for space category chars, "C" locale white space chars,
* and two additions:
* 0085 "next line" control char
* feff "zero-width non-break space"
* isdigitrune is true iff c is a numeric-digit category.
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <libgen.h>
#include "utf.h"
#include "utfdef.h"
enum {
/*
* fields in the unicode data file
*/
FIELD_CODE,
FIELD_NAME,
FIELD_CATEGORY,
FIELD_COMBINING,
FIELD_BIDIR,
FIELD_DECOMP,
FIELD_DECIMAL_DIG,
FIELD_DIG,
FIELD_NUMERIC_VAL,
FIELD_MIRRORED,
FIELD_UNICODE_1_NAME,
FIELD_COMMENT,
FIELD_UPPER,
FIELD_LOWER,
FIELD_TITLE,
NFIELDS,
MAX_LINE = 1024,
TO_OFFSET = 1 << 20,
NRUNES = 1 << 21,
};
#define TO_DELTA(xmapped,x) (TO_OFFSET + (xmapped) - (x))
static char myisspace[NRUNES];
static char myisalpha[NRUNES];
static char myisdigit[NRUNES];
static char myisupper[NRUNES];
static char myislower[NRUNES];
static char myistitle[NRUNES];
static int mytoupper[NRUNES];
static int mytolower[NRUNES];
static int mytotitle[NRUNES];
static void check(void);
static void mktables(char *src, int usepairs);
static void fatal(const char *fmt, ...);
static int mygetfields(char **fields, int nfields, char *str, const char *delim);
static int getunicodeline(FILE *in, char **fields, char *buf);
static int getcode(char *s);
/*
 * print the command-line synopsis on stderr and exit with status 1.
 */
static void
usage(void)
{
	fputs("usage: mktables [-cp] <UnicodeData.txt>\n", stderr);
	exit(1);
}
/*
 * parse a case-mapping field (upper, lower or title) and verify that the
 * result can be used as an index into the NRUNES-sized tables; the closure
 * pass at the end of main does exactly that with every mapped value.
 */
static int
getmapping(char *s)
{
	int mapped;

	mapped = getcode(s);
	if(mapped < 0 || mapped >= NRUNES)
		fatal("case mapping value too big: %x", mapped);
	return mapped;
}

/*
 * read the UnicodeData file named on the command line, fill in the
 * myis* property tables and myto* mapping tables, and then either
 * compare them with the existing runetype functions (-c) or print
 * freshly generated tables on stdout (-p additionally enables the
 * "pair" tables).
 *
 * ARGBEGIN/ARGEND are option-parsing macros defined outside this file.
 * NOTE(review): presumably they also set argv0, which fatal prints --
 * confirm against the header that defines them.
 */
int
main(int argc, char *argv[]){
	FILE *in;
	char buf[MAX_LINE], buf2[MAX_LINE];
	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
	char *p;
	int i, code, last, docheck, usepairs;

	docheck = 0;
	usepairs = 0;

	ARGBEGIN{
	case 'c':
		docheck = 1;
		break;
	case 'p':
		usepairs = 1;
		break;
	default:
		usage();
	}ARGEND

	if(argc != 1){
		usage();
	}

	in = fopen(argv[0], "r");
	if(in == NULL){
		fatal("can't open %s", argv[0]);
	}

	/* every code point maps to itself until the data says otherwise */
	for(i = 0; i < NRUNES; i++){
		mytoupper[i] = i;
		mytolower[i] = i;
		mytotitle[i] = i;
	}

	/*
	 * make sure isspace has all of the "C" locale whitespace chars
	 */
	myisspace['\t'] = 1;
	myisspace['\n'] = 1;
	myisspace['\r'] = 1;
	myisspace['\f'] = 1;
	myisspace['\v'] = 1;

	/*
	 * a couple of other exceptions
	 */
	myisspace[0x85] = 1;	/* control char, "next line" */
	myisspace[0xfeff] = 1;	/* zero-width non-break space */

	last = -1;
	while(getunicodeline(in, fields, buf)){
		code = getcode(fields[FIELD_CODE]);
		if (code >= NRUNES)
			fatal("code-point value too big: %x", code);
		if(code <= last)
			fatal("bad code sequence: %x then %x", last, code);
		last = code;

		/*
		 * check for ranges: a "<..., First>" line must be followed
		 * immediately by the matching "<..., Last>" line.
		 */
		p = fields[FIELD_CATEGORY];
		if(strstr(fields[FIELD_NAME], ", First>") != NULL){
			if(!getunicodeline(in, fields2, buf2))
				fatal("range start at eof");
			if (strstr(fields2[FIELD_NAME], ", Last>") == NULL)
				fatal("range start not followed by range end");
			last = getcode(fields2[FIELD_CODE]);
			if(last <= code)
				fatal("range out of sequence: %x then %x", code, last);
			/*
			 * the loop below indexes the tables up to and
			 * including last, so the range end needs the same
			 * bound as the range start.
			 */
			if(last >= NRUNES)
				fatal("code-point value too big: %x", last);
			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
				fatal("range with mismatched category");
		}

		/*
		 * set properties and conversions
		 */
		for (; code <= last; code++){
			if(p[0] == 'L')
				myisalpha[code] = 1;
			if(p[0] == 'Z')
				myisspace[code] = 1;
			if(strcmp(p, "Lu") == 0)
				myisupper[code] = 1;
			if(strcmp(p, "Ll") == 0)
				myislower[code] = 1;

			if(strcmp(p, "Lt") == 0)
				myistitle[code] = 1;

			if(strcmp(p, "Nd") == 0)
				myisdigit[code] = 1;

			/*
			 * when finding conversions, also need to mark
			 * upper/lower case, since some chars, like
			 * "III" (0x2162), aren't defined as letters but have a
			 * lower case mapping ("iii" (0x2172)).
			 * (the marking itself happens in the closure pass
			 * after the file has been read.)
			 */
			if(fields[FIELD_UPPER][0] != '\0'){
				mytoupper[code] = getmapping(fields[FIELD_UPPER]);
			}
			if(fields[FIELD_LOWER][0] != '\0'){
				mytolower[code] = getmapping(fields[FIELD_LOWER]);
			}
			if(fields[FIELD_TITLE][0] != '\0'){
				mytotitle[code] = getmapping(fields[FIELD_TITLE]);
			}
		}
	}

	fclose(in);

	/*
	 * check for codes with no totitle mapping but a toupper mapping.
	 * these appear in UnicodeData-2.0.14.txt, but are almost certainly
	 * erroneous.
	 */
	for(i = 0; i < NRUNES; i++){
		if(mytotitle[i] == i
		&& mytoupper[i] != i
		&& !myistitle[i])
			fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]);
	}

	/*
	 * make sure isupper[c] is true if for some x toupper[x] == c
	 * ditto for islower and istitle
	 */
	for(i = 0; i < NRUNES; i++) {
		if(mytoupper[i] != i)
			myisupper[mytoupper[i]] = 1;
		if(mytolower[i] != i)
			myislower[mytolower[i]] = 1;
		if(mytotitle[i] != i)
			myistitle[mytotitle[i]] = 1;
	}

	if(docheck){
		check();
	}else{
		mktables(argv[0], usepairs);
	}
	return 0;
}
/*
 * print the table __is<label>r of [first, last] pairs, one per run of
 * consecutive code points that have the property, and clear the prop
 * entries of every run that was printed.
 * runs of length one are printed only when force is set; otherwise
 * they are left in prop for a later pass.
 * returns 1 if a table was printed, 0 if not.
 */
static int
mkisrange(const char* label, char* prop, int force)
{
	int lo, hi, emitted;

	emitted = 0;
	lo = 0;
	while(lo < NRUNES){
		if(!prop[lo]){
			lo++;
			continue;
		}

		/* extend the run, clearing every member after the first */
		hi = lo + 1;
		while(hi < NRUNES && prop[hi]){
			prop[hi] = 0;
			hi++;
		}

		if(force || hi - lo != 1){
			if(!emitted){
				printf("static Rune __is%sr[] = {\n", label);
				emitted = 1;
			}
			prop[lo] = 0;
			printf("\t0x%.4x, 0x%.4x,\n", lo, hi - 1);
		}

		lo = hi;
	}

	if(emitted)
		printf("};\n\n");
	return emitted;
}
/*
 * print the table __is<label>p of [first, last] pairs describing
 * sequences in which every second code point has the property
 * (first, first+2, ..., last), and clear the prop entries covered.
 * a code point with no partner two places on is left in prop.
 * returns 1 if a table was printed, 0 if not.
 */
static int
mkispair(const char *label, char *prop)
{
	int first, next, emitted;

	emitted = 0;
	first = 0;
	while(first + 2 < NRUNES){
		if(!prop[first]){
			first++;
			continue;
		}

		/* walk forward in steps of two, clearing as we go */
		next = first + 2;
		while(next < NRUNES && prop[next]){
			prop[next] = 0;
			next += 2;
		}

		if(next - first != 2){
			if(!emitted){
				printf("static Rune __is%sp[] = {\n", label);
				emitted = 1;
			}
			prop[first] = 0;
			printf("\t0x%.4x, 0x%.4x,\n", first, next - 2);
		}

		first = next;
	}

	if(emitted)
		printf("};\n\n");
	return emitted;
}
/*
 * print the table __is<label>s listing every code point still marked
 * in prop, clearing each entry as it is printed.
 * returns 1 if a table was printed, 0 if not.
 */
static int
mkissingle(const char *label, char *prop)
{
	int c, emitted;

	emitted = 0;
	for(c = 0; c < NRUNES; c++){
		if(prop[c]){
			if(!emitted){
				printf("static Rune __is%ss[] = {\n", label);
				emitted = 1;
			}
			prop[c] = 0;
			printf("\t0x%.4x,\n", c);
		}
	}

	if(emitted)
		printf("};\n\n");
	return emitted;
}
/*
 * print the tables for one property followed by the C source of
 * is<label>rune, which searches only the tables that were printed.
 * ranges are extracted first, then (if usepairs) pairs, then the
 * remaining singletons; prop is cleared in the process.
 */
static void
mkis(const char* label, char* prop, int usepairs)
{
	int haveranges, havepairs, havesingles;

	haveranges = mkisrange(label, prop, 0);
	havepairs = usepairs ? mkispair(label, prop) : 0;
	havesingles = mkissingle(label, prop);

	/* function header */
	printf(
		"int\n"
		"is%srune(Rune c)\n"
		"{\n"
		" Rune *p;\n"
		"\n",
		label);

	if(haveranges){
		printf(
			" p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n"
			" if(p && c >= p[0] && c <= p[1])\n"
			" return 1;\n",
			label, label);
	}

	if(havepairs){
		printf(
			" p = rbsearch(c, __is%sp, nelem(__is%sp)/2, 2);\n"
			" if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
			" return 1;\n",
			label, label);
	}

	if(havesingles){
		printf(
			" p = rbsearch(c, __is%ss, nelem(__is%ss), 1);\n"
			" if(p && c == p[0])\n"
			" return 1;\n",
			label, label);
	}

	/* function trailer */
	printf(
		" return 0;\n"
		"}\n"
		"\n"
	);
}
/*
 * print the table __to<label>r of [first, last, delta] triples, one per
 * run of consecutive code points that share the same mapping delta, and
 * reset the map entries of every run printed back to the identity.
 * delta is stored biased by TO_OFFSET (see TO_DELTA).
 * runs of length one are printed only when force is set, matching
 * mkisrange; otherwise they are left in map for a later pass.
 * returns 1 if a table was printed, 0 if not.
 */
static int
mktorange(const char* label, int* map, int force)
{
	int start, stop, delta, some;

	some = 0;
	for(start = 0; start < NRUNES; ) {
		if(map[start] == start){
			start++;
			continue;
		}

		delta = TO_DELTA(map[start], start);
		/* the tables are arrays of Rune, so delta must fit in one */
		if(delta != (Rune)delta)
			fatal("bad map delta %d", delta);

		/* extend the run, resetting every member after the first */
		for(stop = start + 1; stop < NRUNES; stop++){
			if(TO_DELTA(map[stop], stop) != delta){
				break;
			}
			map[stop] = stop;
		}

		/*
		 * honor force here: it was accepted and documented but
		 * never consulted, so singletons could not be emitted.
		 */
		if(force || stop != start + 1){
			if(!some){
				printf("static Rune __to%sr[] = {\n", label);
				some = 1;
			}
			map[start] = start;
			printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta);
		}

		start = stop;
	}

	if(some)
		printf("};\n\n");
	return some;
}
/*
 * print the table __to<label>p of [first, last, delta] triples
 * describing sequences in which every second code point has the same
 * mapping delta (first, first+2, ..., last), and reset the map entries
 * covered back to the identity.
 * a code point with no partner two places on is left in map.
 * returns 1 if a table was printed, 0 if not.
 */
static int
mktopair(const char* label, int* map)
{
	int first, next, delta, emitted;

	emitted = 0;
	first = 0;
	while(first + 2 < NRUNES){
		if(map[first] == first){
			first++;
			continue;
		}

		delta = TO_DELTA(map[first], first);
		if(delta != (Rune)delta)
			fatal("bad map delta %d", delta);

		/* walk forward in steps of two, resetting as we go */
		next = first + 2;
		while(next < NRUNES && TO_DELTA(map[next], next) == delta){
			map[next] = next;
			next += 2;
		}

		if(next - first != 2){
			if(!emitted){
				printf("static Rune __to%sp[] = {\n", label);
				emitted = 1;
			}
			map[first] = first;
			printf("\t0x%.4x, 0x%.4x, %d,\n", first, next - 2, delta);
		}

		first = next;
	}

	if(emitted)
		printf("};\n\n");
	return emitted;
}
/*
 * print the table __to<label>s of [code, delta] pairs for every code
 * point whose map entry is still not the identity, resetting each
 * entry as it is printed.
 * returns 1 if a table was printed, 0 if not.
 */
static int
mktosingle(const char* label, int* map)
{
	int c, delta, emitted;

	emitted = 0;
	for(c = 0; c < NRUNES; c++){
		if(map[c] != c){
			delta = TO_DELTA(map[c], c);
			if(delta != (Rune)delta)
				fatal("bad map delta %d", delta);

			if(!emitted){
				printf("static Rune __to%ss[] = {\n", label);
				emitted = 1;
			}
			map[c] = c;
			printf("\t0x%.4x, %d,\n", c, delta);
		}
	}

	if(emitted)
		printf("};\n\n");
	return emitted;
}
/*
 * print the tables for one case mapping followed by the C source of
 * to<label>rune, which searches only the tables that were printed and
 * removes the TO_OFFSET bias from the stored delta.
 * ranges are extracted first, then (if usepairs) pairs, then the
 * remaining singletons; map is reset to the identity in the process.
 */
static void
mkto(const char* label, int* map, int usepairs)
{
	int haveranges, havepairs, havesingles;

	haveranges = mktorange(label, map, 0);
	havepairs = usepairs ? mktopair(label, map) : 0;
	havesingles = mktosingle(label, map);

	/* function header */
	printf(
		"Rune\n"
		"to%srune(Rune c)\n"
		"{\n"
		" Rune *p;\n"
		"\n",
		label);

	if(haveranges){
		printf(
			" p = rbsearch(c, __to%sr, nelem(__to%sr)/3, 3);\n"
			" if(p && c >= p[0] && c <= p[1])\n"
			" return c + p[2] - %d;\n",
			label, label, TO_OFFSET);
	}

	if(havepairs){
		printf(
			" p = rbsearch(c, __to%sp, nelem(__to%sp)/3, 3);\n"
			" if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
			" return c + p[2] - %d;\n",
			label, label, TO_OFFSET);
	}

	if(havesingles){
		printf(
			" p = rbsearch(c, __to%ss, nelem(__to%ss)/2, 2);\n"
			" if(p && c == p[0])\n"
			" return c + p[1] - %d;\n",
			label, label, TO_OFFSET);
	}

	/* function trailer */
	printf(
		" return c;\n"
		"}\n"
		"\n"
	);
}
/*
 * make only a range table (singletons included, as one-entry ranges)
 * and a function for is<label>rune.
 * if no code point has the property, mkisrange prints no table, so the
 * generated function must not refer to one; it just returns 0.
 */
static void
mkisronly(const char* label, char* prop) {
	if(!mkisrange(label, prop, 1)){
		printf(
			"int\n"
			"is%srune(Rune c)\n"
			"{\n"
			" return 0;\n"
			"}\n"
			"\n",
			label);
		return;
	}

	printf(
		"int\n"
		"is%srune(Rune c)\n"
		"{\n"
		" Rune *p;\n"
		"\n"
		" p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n"
		" if(p && c >= p[0] && c <= p[1])\n"
		" return 1;\n"
		" return 0;\n"
		"}\n"
		"\n",
		label, label, label);
}
/*
 * generate the body of runetype on stdout: a header comment naming the
 * source data file, then the is*rune tables and functions, then the
 * to*rune tables and functions.
 * assumes there is a function Rune* rbsearch(Rune c, Rune *t, int n, int ne);
 */
static void
mktables(char *src, int usepairs)
{
	static const char *caselabel[] = { "upper", "lower", "title" };
	char *caseprop[] = { myisupper, myislower, myistitle };
	int *casemap[] = { mytoupper, mytolower, mytotitle };
	int k, ncases;

	ncases = (int)(sizeof caselabel / sizeof caselabel[0]);

	printf("/* generated automatically by mkrunetype.c from %s */\n\n",
		basename(src));

	/*
	 * we special case the space and digit tables, since they are assumed
	 * to be small with several ranges.
	 */
	mkisronly("space", myisspace);
	mkisronly("digit", myisdigit);

	/* alpha never uses pair tables */
	mkis("alpha", myisalpha, 0);

	for(k = 0; k < ncases; k++)
		mkis(caselabel[k], caseprop[k], usepairs);
	for(k = 0; k < ncases; k++)
		mkto(caselabel[k], casemap[k], usepairs);
}
/*
 * find differences between the newly generated tables and current runetypes.
 * every code point is run through each existing is*rune / to*rune
 * function and any disagreement with the my* tables is reported on
 * stderr; nothing is returned.
 */
static void
check(void)
{
	int r, got;

	for(r = 0; r < NRUNES; r++){
		got = isdigitrune(r);
		if(got != myisdigit[r])
			fprintf(stderr, "isdigit diff at %x: runetype=%x, unicode=%x\n",
				r, got, myisdigit[r]);

		got = isspacerune(r);
		if(got != myisspace[r])
			fprintf(stderr, "isspace diff at %x: runetype=%x, unicode=%x\n",
				r, got, myisspace[r]);

		got = isupperrune(r);
		if(got != myisupper[r])
			fprintf(stderr, "isupper diff at %x: runetype=%x, unicode=%x\n",
				r, got, myisupper[r]);

		got = islowerrune(r);
		if(got != myislower[r])
			fprintf(stderr, "islower diff at %x: runetype=%x, unicode=%x\n",
				r, got, myislower[r]);

		got = isalpharune(r);
		if(got != myisalpha[r])
			fprintf(stderr, "isalpha diff at %x: runetype=%x, unicode=%x\n",
				r, got, myisalpha[r]);

		got = toupperrune(r);
		if(got != mytoupper[r])
			fprintf(stderr, "toupper diff at %x: runetype=%x, unicode=%x\n",
				r, got, mytoupper[r]);

		got = tolowerrune(r);
		if(got != mytolower[r])
			fprintf(stderr, "tolower diff at %x: runetype=%x, unicode=%x\n",
				r, got, mytolower[r]);

		got = istitlerune(r);
		if(got != myistitle[r])
			fprintf(stderr, "istitle diff at %x: runetype=%x, unicode=%x\n",
				r, got, myistitle[r]);

		got = totitlerune(r);
		if(got != mytotitle[r])
			fprintf(stderr, "totitle diff at %x: runetype=%x, unicode=%x\n",
				r, got, mytotitle[r]);
	}
}
/*
 * split str in place at any character found in delim, storing a pointer
 * to the start of each piece in fields.
 * each delimiter hit is overwritten with a NUL; splitting stops once
 * nfields pointers have been stored, so the last piece keeps any
 * delimiters that remain.  fields[0] is always set to str.
 * returns the number of pointers stored.
 */
static int
mygetfields(char **fields, int nfields, char *str, const char *delim)
{
	int count;
	char *cut;

	fields[0] = str;
	count = 1;
	while(count < nfields && (cut = strpbrk(str, delim)) != NULL){
		*cut = '\0';
		str = cut + 1;
		fields[count++] = str;
	}
	return count;
}
/*
 * read the next line of the data file into buf (MAX_LINE bytes) and
 * split it at ';' into exactly NFIELDS fields, whose pointers are
 * stored in fields (room for NFIELDS + 1 pointers is required so that
 * an over-long record can be detected).
 * returns 1 on success and 0 at end of file; a line that does not fit
 * in buf or has the wrong number of fields is fatal.
 */
static int
getunicodeline(FILE *in, char **fields, char *buf)
{
	char *p;

	if(fgets(buf, MAX_LINE, in) == NULL)
		return 0;

	p = strchr(buf, '\n');
	if(p != NULL)
		*p = '\0';
	else if(!feof(in)){
		/*
		 * no newline and more input pending: the line was truncated.
		 * a missing newline at end of file is just an unterminated
		 * last line, which used to be misreported as too long.
		 */
		fatal("line too long");
	}

	if (mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS)
		fatal("bad number of fields");

	return 1;
}
/*
 * convert the NUL-terminated string s, made of the characters 0-9 and
 * upper case A-F, to the integer it denotes in hexadecimal.
 * an empty string yields 0; any other character is fatal.
 */
static int
getcode(char *s)
{
	int value, digit;
	char *p;

	value = 0;
	for(p = s; *p != '\0'; p++){
		if('0' <= *p && *p <= '9')
			digit = *p - '0';
		else if('A' <= *p && *p <= 'F')
			digit = *p - 'A' + 10;
		else{
			digit = 0;
			fatal("bad code char '%c'", *p);
		}
		value = (value << 4) + digit;
	}
	return value;
}
static void
fatal(const char *fmt, ...)
{
va_list arg;
fprintf(stderr, "%s: fatal error: ", argv0);
va_start(arg, fmt);
vfprintf(stderr, fmt, arg);
va_end(arg);
fprintf(stderr, "\n");
exit(1);
}

View File

@ -1,20 +1,21 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
enum
{
@ -23,27 +24,150 @@ enum
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1,
/* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
Bad = Runeerror
Bad = Runeerror,
};
/*
* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
* This is a slower but "safe" version of the old chartorune
* that works on strings that are not necessarily null-terminated.
*
* If you know for sure that your string is null-terminated,
* chartorune will be a bit faster.
*
* It is guaranteed not to attempt to access "length"
* past the incoming pointer. This is to avoid
* possible access violations. If the string appears to be
* well-formed but incomplete (i.e., to get the whole Rune
* we'd need to read past str+length) then we'll set the Rune
* to Bad and return 0.
*
* Note that if we have decoding problems for other
* reasons, we return 1 instead of 0.
*/
int
chartorune(Rune *rune, char *str)
charntorune(Rune *rune, const char *str, int length)
{
int c, c1, c2;
int c, c1, c2, c3;
long l;
/* When we're not allowed to read anything */
if(length <= 0) {
goto badlen;
}
/*
* one character sequence (7-bit value)
* 00000-0007F => T1
*/
c = *(uchar*)str;
if(c < Tx) {
*rune = c;
return 1;
}
// If we can't read more than one character we must stop
if(length <= 1) {
goto badlen;
}
/*
* two character sequence (11-bit value)
* 0080-07FF => T2 Tx
*/
c1 = *(uchar*)(str+1) ^ Tx;
if(c1 & Testx)
goto bad;
if(c < T3) {
if(c < T2)
goto bad;
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
*rune = l;
return 2;
}
// If we can't read more than two characters we must stop
if(length <= 2) {
goto badlen;
}
/*
* three character sequence (16-bit value)
* 0800-FFFF => T3 Tx Tx
*/
c2 = *(uchar*)(str+2) ^ Tx;
if(c2 & Testx)
goto bad;
if(c < T4) {
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
*rune = l;
return 3;
}
if (length <= 3)
goto badlen;
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(uchar*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
goto bad;
*rune = l;
return 4;
}
// Support for 5-byte or longer UTF-8 would go here, but
// since we don't have that, we'll just fall through to bad.
/*
* bad decoding
*/
bad:
*rune = Bad;
return 1;
badlen:
*rune = Bad;
return 0;
}
/*
* This is the older "unsafe" version, which works fine on
* null-terminated strings.
*/
int
chartorune(Rune *rune, const char *str)
{
int c, c1, c2, c3;
long l;
/*
@ -88,6 +212,26 @@ chartorune(Rune *rune, char *str)
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(uchar*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
goto bad;
*rune = l;
return 4;
}
/*
* Support for 5-byte or longer UTF-8 would go here, but
* since we don't have that, we'll just fall through to bad.
*/
/*
* bad decoding
*/
@ -97,9 +241,16 @@ bad:
}
int
runetochar(char *str, Rune *rune)
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
*consumed = charntorune(rune, str, length);
return *rune != Runeerror || *consumed == 3;
}
int
runetochar(char *str, const Rune *rune)
{
long c;
/* Runes are signed, so convert to unsigned for range check. */
unsigned long c;
/*
* one character sequence
@ -121,57 +272,80 @@ runetochar(char *str, Rune *rune)
return 2;
}
/*
* If the Rune is out of range, convert it to the error rune.
* Do this test here because the error rune encodes to three bytes.
* Doing it earlier would duplicate work, since an out of range
* Rune wouldn't have fit in one or two bytes.
*/
if (c > Runemax)
c = Runeerror;
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
str[0] = T3 | (c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx);
return 3;
if (c <= Rune3) {
str[0] = T3 | (c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
}
int
runelen(long c)
runelen(Rune rune)
{
Rune rune;
char str[10];
rune = c;
return runetochar(str, &rune);
}
int
runenlen(Rune *r, int nrune)
runenlen(const Rune *r, int nrune)
{
int nb, c;
nb = 0;
while(nrune--) {
c = *r++;
if(c <= Rune1)
if (c <= Rune1)
nb++;
else
if(c <= Rune2)
else if (c <= Rune2)
nb += 2;
else
else if (c <= Rune3)
nb += 3;
else /* assert(c <= Rune4) */
nb += 4;
}
return nb;
}
int
fullrune(char *str, int n)
fullrune(const char *str, int n)
{
int c;
if(n > 0) {
c = *(uchar*)str;
if(c < Tx)
if (n > 0) {
int c = *(uchar*)str;
if (c < Tx)
return 1;
if(n > 1)
if(c < T3 || n > 2)
if (n > 1) {
if (c < T3)
return 1;
if (n > 2) {
if (c < T4 || n > 3)
return 1;
}
}
}
return 0;
}

File diff suppressed because it is too large Load Diff

248
src/lib9/utf/utf.h Normal file
View File

@ -0,0 +1,248 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 1998-2002 by Lucent Technologies.
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#ifndef _UTFH_
#define _UTFH_ 1
#include <stdint.h>
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
enum
{
UTFmax = 4, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0x10FFFF, /* maximum rune value */
};
#ifdef __cplusplus
extern "C" {
#endif
/*
* rune routines
*/
/*
* These routines were written by Rob Pike and Ken Thompson
* and first appeared in Plan 9.
* SEE ALSO
* utf (7)
* tcs (1)
*/
// runetochar copies (encodes) one rune, pointed to by r, to at most
// UTFmax bytes starting at s and returns the number of bytes generated.
int runetochar(char* s, const Rune* r);
// chartorune copies (decodes) at most UTFmax bytes starting at s to
// one rune, pointed to by r, and returns the number of bytes consumed.
// If the input is not exactly in UTF format, chartorune will set *r
// to Runeerror and return 1.
//
// Note: There is no special case for a "null-terminated" string. A
// string whose first byte has the value 0 is the UTF8 encoding of the
// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal
// anywhere else in a UTF sequence.
int chartorune(Rune* r, const char* s);
// charntorune is like chartorune, except that it will access at most
// n bytes of s. If the UTF sequence is incomplete within n bytes,
// charntorune will set *r to Runeerror and return 0. If it is complete
// but not in UTF format, it will set *r to Runeerror and return 1.
//
// Added 2004-09-24 by Wei-Hwa Huang
int charntorune(Rune* r, const char* s, int n);
// isvalidcharntorune(str, n, r, consumed)
// is a convenience function that calls "*consumed = charntorune(r, str, n)"
// and returns an int (logically boolean) indicating whether the first
// n bytes of str was a valid and complete UTF sequence.
int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
// runelen returns the number of bytes required to convert r into UTF.
int runelen(Rune r);
// runenlen returns the number of bytes required to convert the n
// runes pointed to by r into UTF.
int runenlen(const Rune* r, int n);
// fullrune returns 1 if the string s of length n is long enough to be
// decoded by chartorune, and 0 otherwise. This does not guarantee
// that the string contains a legal UTF encoding. This routine is used
// by programs that obtain input one byte at a time and need to know
// when a full rune has arrived.
int fullrune(const char* s, int n);
// The following routines are analogous to the corresponding string
// routines with "utf" substituted for "str", and "rune" substituted
// for "chr".
// utflen returns the number of runes that are represented by the UTF
// string s. (cf. strlen)
int utflen(const char* s);
// utfnlen returns the number of complete runes that are represented
// by the first n bytes of the UTF string s. If the last few bytes of
// the string contain an incompletely coded rune, utfnlen will not
// count them; in this way, it differs from utflen, which includes
// every byte of the string. (cf. strnlen)
int utfnlen(const char* s, long n);
// utfrune returns a pointer to the first occurrence of rune r in the
// UTF string s, or 0 if r does not occur in the string. The NULL
// byte terminating a string is considered to be part of the string s.
// (cf. strchr)
const char* utfrune(const char* s, Rune r);
// utfrrune returns a pointer to the last occurrence of rune r in the
// UTF string s, or 0 if r does not occur in the string. The NULL
// byte terminating a string is considered to be part of the string s.
// (cf. strrchr)
const char* utfrrune(const char* s, Rune r);
// utfutf returns a pointer to the first occurrence of the UTF string
// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
// null string, utfutf returns s1. (cf. strstr)
const char* utfutf(const char* s1, const char* s2);
// utfecpy copies UTF sequences until a null sequence has been copied,
// but writes no sequences beyond es1. If any sequences are copied,
// s1 is terminated by a null sequence, and a pointer to that sequence
// is returned. Otherwise, the original s1 is returned. (cf. strecpy)
char* utfecpy(char *s1, char *es1, const char *s2);
// These functions are rune-string analogues of the corresponding
// functions in strcat (3).
//
// These routines first appeared in Plan 9.
// SEE ALSO
// memmove (3)
// rune (3)
// strcat (2)
//
// BUGS: The outcome of overlapping moves varies among implementations.
Rune* runestrcat(Rune* s1, const Rune* s2);
Rune* runestrncat(Rune* s1, const Rune* s2, long n);
const Rune* runestrchr(const Rune* s, Rune c);
int runestrcmp(const Rune* s1, const Rune* s2);
int runestrncmp(const Rune* s1, const Rune* s2, long n);
Rune* runestrcpy(Rune* s1, const Rune* s2);
Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
Rune* runestrdup(const Rune* s);
const Rune* runestrrchr(const Rune* s, Rune c);
long runestrlen(const Rune* s);
const Rune* runestrstr(const Rune* s1, const Rune* s2);
// The following routines test types and modify cases for Unicode
// characters. Unicode defines some characters as letters and
// specifies three cases: upper, lower, and title. Mappings among the
// cases are also defined, although they are not exhaustive: some
// upper case letters have no lower case mapping, and so on. Unicode
// also defines several character properties, a subset of which are
// checked by these routines. These routines are based on Unicode
// version 3.0.0.
//
// NOTE: The routines are implemented in C, so the boolean functions
// (e.g., isupperrune) return 0 for false and 1 for true.
//
//
// toupperrune, tolowerrune, and totitlerune are the Unicode case
// mappings. These routines return the character unchanged if it has
// no defined mapping.
Rune toupperrune(Rune r);
Rune tolowerrune(Rune r);
Rune totitlerune(Rune r);
// isupperrune tests for upper case characters, including Unicode
// upper case letters and targets of the toupper mapping. islowerrune
// and istitlerune are defined analogously.
int isupperrune(Rune r);
int islowerrune(Rune r);
int istitlerune(Rune r);
// isalpharune tests for Unicode letters; this includes ideographs in
// addition to alphabetic characters.
int isalpharune(Rune r);
// isdigitrune tests for digits. Non-digit numbers, such as Roman
// numerals, are not included.
int isdigitrune(Rune r);
// isideographicrune tests for ideographic characters and numbers, as
// defined by the Unicode standard.
int isideographicrune(Rune r);
// isspacerune tests for whitespace characters, including "C" locale
// whitespace, Unicode defined whitespace, and the "zero-width
// non-break space" character.
int isspacerune(Rune r);
// (The comments in this file were copied from the manpage files rune.3,
// isalpharune.3, and runestrcat.3. Some formatting changes were also made
// to conform to Google style. /JRM 11/11/05)
#ifdef __cplusplus
}
#endif
#endif

View File

@ -12,36 +12,17 @@
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
/*
* compiler directive on Plan 9
*/
#ifndef USED
#define USED(x) if(x);else
#endif
#define uchar _utfuchar
#define ushort _utfushort
#define uint _utfuint
#define ulong _utfulong
#define vlong _utfvlong
#define uvlong _utfuvlong
/*
* easiest way to make sure these are defined
*/
#define uchar _fmtuchar
#define ushort _fmtushort
#define uint _fmtuint
#define ulong _fmtulong
#define vlong _fmtvlong
#define uvlong _fmtuvlong
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long ulong;
typedef unsigned long long uvlong;
typedef long long vlong;
/*
* nil cannot be ((void*)0) on ANSI C,
* because it is used for function pointers
*/
#undef nil
#define nil 0
#undef nelem
#define nelem ((void*)0)
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
#define nil ((void*)0)

View File

@ -7,18 +7,17 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#define _BSD_SOURCE 1 /* memccpy */
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
char*
utfecpy(char *to, char *e, char *from)
utfecpy(char *to, char *e, const char *from)
{
char *end;

View File

@ -7,17 +7,17 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
int
utflen(char *s)
utflen(const char *s)
{
int c;
long n;
@ -34,4 +34,5 @@ utflen(char *s)
s += chartorune(&rune, s);
n++;
}
return 0;
}

View File

@ -7,22 +7,22 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
int
utfnlen(char *s, long m)
utfnlen(const char *s, long m)
{
int c;
long n;
Rune rune;
char *es;
const char *es;
es = s + m;
for(n = 0; s < es; n++) {

View File

@ -7,21 +7,22 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
const
char*
utfrrune(char *s, long c)
utfrrune(const char *s, Rune c)
{
long c1;
Rune r;
char *s1;
const char *s1;
if(c < Runesync) /* not part of utf sequence */
return strrchr(s, c);
@ -42,4 +43,5 @@ utfrrune(char *s, long c)
s1 = s;
s += c1;
}
return 0;
}

View File

@ -7,17 +7,18 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
const
char*
utfrune(char *s, long c)
utfrune(const char *s, Rune c)
{
long c1;
Rune r;
@ -41,4 +42,5 @@ utfrune(char *s, long c)
return s;
s += n;
}
return 0;
}

View File

@ -7,24 +7,25 @@
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
* ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "plan9.h"
#include "utf.h"
#include "utfdef.h"
/*
* Return pointer to first occurrence of s2 in s1,
* 0 if none
*/
const
char*
utfutf(char *s1, char *s2)
utfutf(const char *s1, const char *s2)
{
char *p;
const char *p;
long f, n1, n2;
Rune r;
@ -34,7 +35,7 @@ utfutf(char *s1, char *s2)
return strstr(s1, s2);
n2 = strlen(s2);
for(p=s1; p=utfrune(p, f); p+=n1)
for(p=s1; (p=utfrune(p, f)) != 0; p+=n1)
if(strncmp(p, s2, n2) == 0)
return p;
return 0;

View File

@ -20,6 +20,7 @@ LIBOFILES=\
runtime.$O\
map.$O\
print.$O\
rune.$O\
string.$O\
sys_file.$O\

224
src/runtime/rune.c Normal file
View File

@ -0,0 +1,224 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
/*
* This code is copied, with slight editing due to type differences,
* from a subset of ../lib9/utf/rune.c
*/
#include "runtime.h"
/*
 * Constants describing the UTF-8 bit layout used by charntorune
 * and runetochar below.
 */
enum
{
/*
 * BitN is the number of payload bits carried by the first byte of
 * an N-byte sequence; Bitx is the number of payload bits carried
 * by each continuation byte.
 */
Bit1 = 7,
Bitx = 6,
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
Bit5 = 2,
/*
 * TN is the tag (high-order bit pattern) of the first byte of an
 * N-byte sequence; Tx is the tag of a continuation byte.
 * T5 is used only as the upper bound for a four-byte lead byte.
 */
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
/*
 * RuneN is the largest value whose payload bits fit in an N-byte
 * sequence.  The decoder also uses RuneN-1 as the overlong
 * threshold: an N-byte sequence must decode to a value > RuneN-1.
 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1,
/* 0001 1111 1111 1111 1111 1111 */
/*
 * Maskx selects the six payload bits of a continuation byte.
 * Testx selects the two tag bits; after a byte has been XORed with
 * Tx, any bit left set under Testx means it was not a continuation.
 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
Runeerror = 0xFFFD, /* substituted for undecodable input and out-of-range runes */
Runeself = 0x80, /* presumably: runes below this are a single byte equal to themselves (unused in the code visible here) */
Bad = Runeerror, /* value stored in *rune by the decoder on failure */
Runemax = 0x10FFFF, /* maximum rune value */
};
/*
 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
 * This is a slower but "safe" version of the old chartorune
 * that works on strings that are not necessarily null-terminated.
 *
 * If you know for sure that your string is null-terminated,
 * chartorune will be a bit faster.
 *
 * charntorune decodes the UTF-8 sequence that starts at str and
 * stores the decoded value in *rune.  It never reads beyond
 * str[length-1], which avoids possible access violations.
 *
 * Return value:
 *	1..4	a well-formed sequence of that many bytes was decoded;
 *	1	the bytes are malformed (stray or missing continuation
 *		byte, overlong form, lead byte of a 5-byte or longer
 *		sequence, or a value above Runemax); *rune is Bad;
 *	0	the sequence looks well-formed so far but is incomplete,
 *		i.e. decoding it would require reading past str+length
 *		(this includes length <= 0); *rune is Bad.
 */
int32
charntorune(int32 *rune, byte *str, int32 length)
{
	int32 c, c1, c2, c3;
	int32 l;

	/* When we're not allowed to read anything */
	if(length <= 0) {
		goto badlen;
	}

	/*
	 * one character sequence (7-bit value)
	 *	00000-0007F => T1
	 */
	c = *(byte*)str;	/* cast not necessary, but kept for safety */
	if(c < Tx) {
		*rune = c;
		return 1;
	}

	/* If we can't read more than one character we must stop */
	if(length <= 1) {
		goto badlen;
	}

	/*
	 * two character sequence (11-bit value)
	 *	0080-07FF => T2 Tx
	 *
	 * XOR with Tx clears the tag of a real continuation byte, so
	 * any bit left under Testx means str[1] is not a continuation.
	 */
	c1 = *(byte*)(str+1) ^ Tx;
	if(c1 & Testx)
		goto bad;
	if(c < T3) {
		/* a continuation byte cannot start a sequence */
		if(c < T2)
			goto bad;
		l = ((c << Bitx) | c1) & Rune2;
		/* reject overlong encodings of one-byte values */
		if(l <= Rune1)
			goto bad;
		*rune = l;
		return 2;
	}

	/* If we can't read more than two characters we must stop */
	if(length <= 2) {
		goto badlen;
	}

	/*
	 * three character sequence (16-bit value)
	 *	0800-FFFF => T3 Tx Tx
	 */
	c2 = *(byte*)(str+2) ^ Tx;
	if(c2 & Testx)
		goto bad;
	if(c < T4) {
		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
		/* reject overlong encodings of two-byte values */
		if(l <= Rune2)
			goto bad;
		*rune = l;
		return 3;
	}

	/* If we can't read more than three characters we must stop */
	if(length <= 3)
		goto badlen;

	/*
	 * four character sequence (21-bit value)
	 *	10000-10FFFF => T4 Tx Tx Tx
	 */
	c3 = *(byte*)(str+3) ^ Tx;
	if(c3 & Testx)
		goto bad;
	if(c < T5) {
		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
		/*
		 * Reject overlong encodings of three-byte values, and
		 * values above Runemax: four bytes can carry up to
		 * 1FFFFF, but runetochar never produces such a rune
		 * (it substitutes Runeerror), so the decoder must not
		 * accept one either.
		 */
		if(l <= Rune3 || l > Runemax)
			goto bad;
		*rune = l;
		return 4;
	}

	/*
	 * Support for 5-byte or longer UTF-8 would go here, but
	 * since we don't have that, we'll just fall through to bad.
	 */

	/*
	 * bad decoding
	 */
bad:
	*rune = Bad;
	return 1;

	/*
	 * ran out of input before the sequence was complete
	 */
badlen:
	*rune = Bad;
	return 0;
}
/*
 * runetochar writes the UTF-8 encoding of rune into str and returns
 * the number of bytes written (1 to 4).  A rune outside 0..Runemax is
 * encoded as Runeerror, which takes three bytes.
 * (note: in original, arg2 was pointer)
 */
int32
runetochar(byte *str, int32 rune)
{
	uint32 cp, lead;
	int32 n, i;

	/* Runes are signed, so go through unsigned for the range check. */
	cp = rune;

	/*
	 * An out-of-range value is larger than Rune2, so replacing it
	 * up front cannot disturb the one- and two-byte cases.
	 */
	if(cp > Runemax)
		cp = Runeerror;

	/*
	 * Pick the sequence length and the tag of its first byte:
	 *	00000-0007F  => T1
	 *	00080-007FF  => T2 Tx
	 *	00800-0FFFF  => T3 Tx Tx
	 *	10000-1FFFFF => T4 Tx Tx Tx
	 */
	if(cp <= Rune1) {
		n = 1;
		lead = T1;
	} else if(cp <= Rune2) {
		n = 2;
		lead = T2;
	} else if(cp <= Rune3) {
		n = 3;
		lead = T3;
	} else {
		n = 4;
		lead = T4;
	}

	/*
	 * Emit the continuation bytes last to first, six bits apiece;
	 * whatever is left over belongs in the first byte.
	 */
	for(i = n-1; i > 0; i--) {
		str[i] = Tx | (cp & Maskx);
		cp >>= Bitx;
	}
	str[0] = lead | cp;
	return n;
}

View File

@ -85,6 +85,8 @@ enum
int32 strcmp(byte*, byte*);
int32 findnull(int8*);
void dump(byte*, int32);
/* UTF-8 encode/decode, defined in rune.c */
int32 runetochar(byte*, int32);
int32 charntorune(int32*, byte*, int32);
extern string emptystring;
extern int32 debug;

View File

@ -151,55 +151,6 @@ sys·indexstring(string s, int32 i, byte b)
FLUSH(&b);
}
/*
 * Plan 9 style runetochar, generalized to sequences longer than
 * the usual ones (36 bits in 7 bytes).  Because the code point
 * arrives as a uint32 argument, only 32 bits ever get encoded.
 * Stores the encoding of c in str and returns its length in bytes.
 */
static int32
runetochar(byte *str, uint32 c)
{
	int32 ext, k;
	uint32 limit, lead;

	/*
	 * 7-bit values are stored as a single byte
	 */
	if(c <= 0x07FUL) {
		str[0] = c;
		return 1;
	}

	/*
	 * Find the number of extension bytes needed.
	 * Each additional byte buys 5 more bits of code point:
	 * the first byte gives one up, the new extension byte
	 * contributes six.
	 *	limit is the largest code point that fits
	 *	lead is the marker for the zeroth byte
	 */
	ext = 1;
	limit = 0x7ffUL;
	lead = 0xC0UL;
	while(c > limit) {
		limit = (limit<<5) | 0x1fUL;
		lead = (lead>>1) | 0x80UL;
		ext++;
	}

	/*
	 * Fill in the extension bytes from the last one back to
	 * the first, then put what remains under the lead marker.
	 */
	k = ext;
	while(k > 0) {
		str[k] = 0x80UL | (c&0x3fUL);
		c >>= 6;
		k--;
	}
	str[0] = lead|c;
	return ext+1;
}
void
sys·intstring(int64 v, string s)
{

View File

@ -75,5 +75,14 @@ func main() {
`\000\123\x00\312\xFE\u0123\ubabe\U0000babe`,
"backslashes 2 (backquote)");
assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)");
// test large runes. perhaps not the most logical place for this test.
var r int32;
r = 0x10ffff; // largest rune value
s = string(r);
assert(s, "\xf4\x8f\xbf\xbf", "largest rune");
r = 0x10ffff + 1;
s = string(r);
assert(s, "\xef\xbf\xbd", "too-large rune");
sys.exit(ecode);
}