update to Unicode 5

SVN=126184
2024-11-21 23:54:40 -07:00 · 2008-07-07 14:07:46 -07:00 · 2008-07-07 14:07:46 -07:00 · 5b904a3bde
commit 5b904a3bde
parent 0d079a5362
24 changed files with 1520 additions and 1256 deletions
--- a/src/lib/math/asin.go
+++ b/src/lib/math/asin.go
@ -34,7 +34,7 @@ asin(arg double)double
 		sign = true;
 	}
 	if arg > 1 {
-		panic "return sys.NaN()";
+		return sys.NaN();
 	}

 	temp = sqrt(1 - x*x);
@ -54,7 +54,7 @@ func
 acos(arg double)double
 {
 	if(arg > 1 || arg < -1) {
-		panic "return sys.NaN()";
+		return sys.NaN();
 	}
 	return pio2 - asin(arg);
 }
--- a/src/lib/math/exp.go
+++ b/src/lib/math/exp.go
@ -40,7 +40,7 @@ exp(arg double) double
 		return 0.;
 	}
 	if arg > maxf {
-		panic "return sys.Inf(1)"
+		return sys.Inf(1)
 	}

 	x = arg*log2e;
--- a/src/lib/math/log.go
+++ b/src/lib/math/log.go
@ -36,7 +36,7 @@ log(arg double) double
 	var exp int;

 	if arg <= 0 {
-		panic "return sys.NaN()";
+		return sys.NaN();
 	}

 	exp,x = sys.frexp(arg);
@ -63,7 +63,7 @@ log10(arg double) double
 {

 	if arg <= 0 {
-		panic "return sys.NaN()";
+		return sys.NaN();
 	}
 	return log(arg) * ln10o1;
 }
--- a/src/lib/math/main.go
+++ b/src/lib/math/main.go
@ -5,7 +5,25 @@

 package main

-import math "math"
+//import math "math"
+//////////////////
+ import math "asin"
+ import math "atan"
+ import math "atan2"
+ import math "exp"
+ import math "fabs"
+ import math "floor"
+ import math "fmod"
+ import math "hypot"
+ import math "log"
+ import math "pow"
+ import math "pow10"
+ import math "sin"
+ import math "sinh"
+ import math "sqrt"
+ import math "tan"
+ import math "tanh"
+

 const
 (
--- a/src/lib/math/pow.go
+++ b/src/lib/math/pow.go
@ -26,14 +26,14 @@ pow(arg1,arg2 double) double
 	if arg1 <= 0 {
 		if(arg1 == 0) {
 			if arg2 <= 0 {
-				panic "return sys.NaN()";
+				return sys.NaN();
 			}
 			return 0;
 		}

 		temp = floor(arg2);
 		if temp != arg2 {
-			panic "return sys.NaN()";
+			panic sys.NaN();
 		}

 		l = long(temp);
--- a/src/lib/math/sinh.go
+++ b/src/lib/math/sinh.go
@ -48,7 +48,7 @@ sinh(arg double) double
 		temp = exp(arg)/2;

 	case arg > 0.5:
-//		temp = (exp(arg) - exp(-arg))/2;
+		temp = (exp(arg) - exp(-arg))/2;

 	default:
 		argsq = arg*arg;
@ -71,5 +71,5 @@ cosh(arg double) double
 	if arg > 21 {
 		return exp(arg)/2;
 	}
-//	return (exp(arg) + exp(-arg))/2;
+	return (exp(arg) + exp(-arg))/2;
 }
--- a/src/lib/math/sqrt.go
+++ b/src/lib/math/sqrt.go
@ -19,11 +19,10 @@ sqrt(arg double) double
 	var x, temp double;
 	var exp, i int;

-/* BUG: NO isINF
 	if sys.isInf(arg, 1) {
 		return arg;
 	}
-*/
+
 	if arg <= 0 {
 		if arg < 0 {
 			panic "return sys.NaN()"
--- a/src/lib/math/tan.go
+++ b/src/lib/math/tan.go
@ -62,7 +62,7 @@ tan(arg double) double

 	if flag {
 		if(temp == 0) {
-			panic "return sys.NaN()";
+			panic sys.NaN();
 		}
 		temp = 1/temp;
 	}
--- a/src/lib9/utf/mkrunetype.c
+++ b/src/lib9/utf/mkrunetype.c
@ -0,0 +1,733 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+ * make is(upper|lower|title|space|alpha)rune and
+ * to(upper|lower|title)rune from a UnicodeData.txt file.
+ * these can be found at unicode.org
+ *
+ * with -c, runs a check of the existing runetype functions vs.
+ * those extracted from UnicodeData.
+ *
+ * with -p, generates tables for pairs of chars, as well as for ranges
+ * and singletons.
+ *
+ * UnicodeData defines 4 fields of interest:
+ * 1) a category
+ * 2) an upper case mapping
+ * 3) a lower case mapping
+ * 4) a title case mapping
+ *
+ * toupper, tolower, and totitle are defined directly from the mapping.
+ *
+ * isalpharune(c) is true iff c is in a "letter" (L*) category
+ * isupperrune(c) is true iff c is the target of toupperrune,
+ *	or is in the uppercase letter category
+ * similarly for islowerrune and istitlerune.
+ * isspacerune is true for space category chars, "C" locale white space chars,
+ *	and two additions:
+ *	0085	"next line" control char
+ *	feff	"zero-width non-break space"
+ * isdigitrune is true iff c is in the "Nd" (decimal digit) category.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <libgen.h>
+#include "utf.h"
+#include "utfdef.h"
+
+enum {
+	/*
+	 * fields in the unicode data file
+	 */
+	FIELD_CODE,
+	FIELD_NAME,
+	FIELD_CATEGORY,
+	FIELD_COMBINING,
+	FIELD_BIDIR,
+	FIELD_DECOMP,
+	FIELD_DECIMAL_DIG,
+	FIELD_DIG,
+	FIELD_NUMERIC_VAL,
+	FIELD_MIRRORED,
+	FIELD_UNICODE_1_NAME,
+	FIELD_COMMENT,
+	FIELD_UPPER,
+	FIELD_LOWER,
+	FIELD_TITLE,
+	NFIELDS,
+
+	MAX_LINE	= 1024,
+
+	TO_OFFSET	= 1 << 20,
+
+	NRUNES		= 1 << 21,
+};
+
+#define TO_DELTA(xmapped,x)	(TO_OFFSET + (xmapped) - (x))
+
+static char	myisspace[NRUNES];
+static char	myisalpha[NRUNES];
+static char	myisdigit[NRUNES];
+static char	myisupper[NRUNES];
+static char	myislower[NRUNES];
+static char	myistitle[NRUNES];
+
+static int	mytoupper[NRUNES];
+static int	mytolower[NRUNES];
+static int	mytotitle[NRUNES];
+
+static void	check(void);
+static void	mktables(char *src, int usepairs);
+static void	fatal(const char *fmt, ...);
+static int	mygetfields(char **fields, int nfields, char *str, const char *delim);
+static int	getunicodeline(FILE *in, char **fields, char *buf);
+static int	getcode(char *s);
+
+static void
+usage(void)
+{
+	fprintf(stderr, "usage: mktables [-cp] <UnicodeData.txt>\n");
+	exit(1);
+}
+
+int
+main(int argc, char *argv[]){
+	FILE *in;
+	char buf[MAX_LINE], buf2[MAX_LINE];
+	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
+	char *p;
+	int i, code, last, docheck, usepairs;
+
+	docheck = 0;
+	usepairs = 0;
+	ARGBEGIN{
+	case 'c':
+		docheck = 1;
+		break;
+	case 'p':
+		usepairs = 1;
+		break;
+	default:
+		usage();
+	}ARGEND
+
+	if(argc != 1){
+		usage();
+	}
+
+	in = fopen(argv[0], "r");
+	if(in == NULL){
+		fatal("can't open %s", argv[0]);
+	}
+
+	for(i = 0; i < NRUNES; i++){
+		mytoupper[i] = i;
+		mytolower[i] = i;
+		mytotitle[i] = i;
+	}
+
+	/*
+	 * make sure isspace has all of the "C" locale whitespace chars
+	 */
+	myisspace['\t'] = 1;
+	myisspace['\n'] = 1;
+	myisspace['\r'] = 1;
+	myisspace['\f'] = 1;
+	myisspace['\v'] = 1;
+
+	/*
+	 * a couple of other exceptions
+	 */
+	myisspace[0x85] = 1;	/* control char, "next line" */
+	myisspace[0xfeff] = 1;	/* zero-width non-break space */
+
+	last = -1;
+	while(getunicodeline(in, fields, buf)){
+		code = getcode(fields[FIELD_CODE]);
+                if (code >= NRUNES)
+                  fatal("code-point value too big: %x", code);
+		if(code <= last)
+			fatal("bad code sequence: %x then %x", last, code);
+		last = code;
+
+		/*
+		 * check for ranges
+		 */
+		p = fields[FIELD_CATEGORY];
+		if(strstr(fields[FIELD_NAME], ", First>") != NULL){
+			if(!getunicodeline(in, fields2, buf2))
+				fatal("range start at eof");
+			if (strstr(fields2[FIELD_NAME], ", Last>") == NULL)
+				fatal("range start not followed by range end");
+			last = getcode(fields2[FIELD_CODE]);
+			if(last <= code)
+				fatal("range out of sequence: %x then %x", code, last);
+			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
+				fatal("range with mismatched category");
+		}
+
+		/*
+		 * set properties and conversions
+		 */
+		for (; code <= last; code++){
+			if(p[0] == 'L')
+				myisalpha[code] = 1;
+			if(p[0] == 'Z')
+				myisspace[code] = 1;
+
+			if(strcmp(p, "Lu") == 0)
+				myisupper[code] = 1;
+			if(strcmp(p, "Ll") == 0)
+				myislower[code] = 1;
+
+			if(strcmp(p, "Lt") == 0)
+				myistitle[code] = 1;
+
+			if(strcmp(p, "Nd") == 0)
+				myisdigit[code] = 1;
+
+			/*
+			 * when finding conversions, also need to mark
+			 * upper/lower case, since some chars, like
+			 * "III" (0x2162), aren't defined as letters but have a
+			 * lower case mapping ("iii" (0x2172)).
+			 */
+			if(fields[FIELD_UPPER][0] != '\0'){
+				mytoupper[code] = getcode(fields[FIELD_UPPER]);
+			}
+			if(fields[FIELD_LOWER][0] != '\0'){
+				mytolower[code] = getcode(fields[FIELD_LOWER]);
+			}
+			if(fields[FIELD_TITLE][0] != '\0'){
+				mytotitle[code] = getcode(fields[FIELD_TITLE]);
+			}
+		}
+	}
+
+	fclose(in);
+
+	/*
+	 * check for codes with no totitle mapping but a toupper mapping.
+	 * these appear in UnicodeData-2.0.14.txt, but are almost certainly
+	 * erroneous.
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytotitle[i] == i
+		&& mytoupper[i] != i
+		&& !myistitle[i])
+			fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]);
+	}
+
+	/*
+	 * make sure isupper[c] is true if for some x toupper[x]  == c
+	 * ditto for islower and istitle
+	 */
+	for(i = 0; i < NRUNES; i++) {
+		if(mytoupper[i] != i)
+			myisupper[mytoupper[i]] = 1;
+		if(mytolower[i] != i)
+			myislower[mytolower[i]] = 1;
+		if(mytotitle[i] != i)
+			myistitle[mytotitle[i]] = 1;
+	}
+
+	if(docheck){
+		check();
+	}else{
+		mktables(argv[0], usepairs);
+	}
+	return 0;
+}
+
+/*
+ * generate a properties array for ranges, clearing those cases covered.
+ * if force, generate one-entry ranges for singletons.
+ */
+static int
+mkisrange(const char* label, char* prop, int force)
+{
+	int start, stop, some;
+
+	/*
+	 * first, the ranges
+	 */
+	some = 0;
+	for(start = 0; start < NRUNES; ) {
+		if(!prop[start]){
+			start++;
+			continue;
+		}
+
+		for(stop = start + 1; stop < NRUNES; stop++){
+			if(!prop[stop]){
+				break;
+			}
+			prop[stop] = 0;
+		}
+		if(force || stop != start + 1){
+			if(!some){
+				printf("static Rune __is%sr[] = {\n", label);
+				some = 1;
+			}
+			prop[start] = 0;
+			printf("\t0x%.4x, 0x%.4x,\n", start, stop - 1);
+		}
+
+		start = stop;
+	}
+	if(some)
+		printf("};\n\n");
+	return some;
+}
+
+/*
+ * generate a properties array for pairs with a skip between,
+ * clearing those cases covered.
+ */
+static int
+mkispair(const char *label, char *prop)
+{
+	int start, stop, some;
+
+	some = 0;
+	for(start = 0; start + 2 < NRUNES; ) {
+		if(!prop[start]){
+			start++;
+			continue;
+		}
+
+		for(stop = start + 2; stop < NRUNES; stop += 2){
+			if(!prop[stop]){
+				break;
+			}
+			prop[stop] = 0;
+		}
+		if(stop != start + 2){
+			if(!some){
+				printf("static Rune __is%sp[] = {\n", label);
+				some = 1;
+			}
+			prop[start] = 0;
+			printf("\t0x%.4x, 0x%.4x,\n", start, stop - 2);
+		}
+
+		start = stop;
+	}
+	if(some)
+		printf("};\n\n");
+	return some;
+}
+
+/*
+ * generate a properties array for singletons, clearing those cases covered.
+ */
+static int
+mkissingle(const char *label, char *prop)
+{
+	int start, some;
+
+	some = 0;
+	for(start = 0; start < NRUNES; start++) {
+		if(!prop[start]){
+			continue;
+		}
+
+		if(!some){
+			printf("static Rune __is%ss[] = {\n", label);
+			some = 1;
+		}
+		prop[start] = 0;
+		printf("\t0x%.4x,\n", start);
+	}
+	if(some)
+		printf("};\n\n");
+	return some;
+}
+
+/*
+ * generate tables and a function for is<label>rune
+ */
+static void
+mkis(const char* label, char* prop, int usepairs)
+{
+	int isr, isp, iss;
+
+	isr = mkisrange(label, prop, 0);
+	isp = 0;
+	if(usepairs)
+		isp = mkispair(label, prop);
+	iss = mkissingle(label, prop);
+
+	printf(
+		"int\n"
+		"is%srune(Rune c)\n"
+		"{\n"
+		"	Rune *p;\n"
+		"\n",
+		label);
+
+	if(isr)
+		printf(
+			"	p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n"
+			"	if(p && c >= p[0] && c <= p[1])\n"
+			"		return 1;\n",
+			label, label);
+
+	if(isp)
+		printf(
+			"	p = rbsearch(c, __is%sp, nelem(__is%sp)/2, 2);\n"
+			"	if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
+			"		return 1;\n",
+			label, label);
+
+	if(iss)
+		printf(
+			"	p = rbsearch(c, __is%ss, nelem(__is%ss), 1);\n"
+			"	if(p && c == p[0])\n"
+			"		return 1;\n",
+			label, label);
+
+
+	printf(
+		"	return 0;\n"
+		"}\n"
+		"\n"
+	);
+}
+
+/*
+ * generate a mapping array for ranges, clearing those entries covered.
+ * force is accepted but unused: singletons are never emitted here.
+ */
+static int
+mktorange(const char* label, int* map, int force)
+{
+	int start, stop, delta, some;
+
+	some = 0;
+	for(start = 0; start < NRUNES; ) {
+		if(map[start] == start){
+			start++;
+			continue;
+		}
+
+		delta = TO_DELTA(map[start], start);
+		if(delta != (Rune)delta)
+			fatal("bad map delta %d", delta);
+		for(stop = start + 1; stop < NRUNES; stop++){
+			if(TO_DELTA(map[stop], stop) != delta){
+				break;
+			}
+			map[stop] = stop;
+		}
+		if(stop != start + 1){
+			if(!some){
+				printf("static Rune __to%sr[] = {\n", label);
+				some = 1;
+			}
+			map[start] = start;
+			printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta);
+		}
+
+		start = stop;
+	}
+	if(some)
+		printf("};\n\n");
+	return some;
+}
+
+/*
+ * generate a mapping array for pairs with a skip between,
+ * clearing those entries covered.
+ */
+static int
+mktopair(const char* label, int* map)
+{
+	int start, stop, delta, some;
+
+	some = 0;
+	for(start = 0; start + 2 < NRUNES; ) {
+		if(map[start] == start){
+			start++;
+			continue;
+		}
+
+		delta = TO_DELTA(map[start], start);
+		if(delta != (Rune)delta)
+			fatal("bad map delta %d", delta);
+		for(stop = start + 2; stop < NRUNES; stop += 2){
+			if(TO_DELTA(map[stop], stop) != delta){
+				break;
+			}
+			map[stop] = stop;
+		}
+		if(stop != start + 2){
+			if(!some){
+				printf("static Rune __to%sp[] = {\n", label);
+				some = 1;
+			}
+			map[start] = start;
+			printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 2, delta);
+		}
+
+		start = stop;
+	}
+	if(some)
+		printf("};\n\n");
+	return some;
+}
+
+/*
+ * generate a mapping array for singletons, clearing those entries covered.
+ */
+static int
+mktosingle(const char* label, int* map)
+{
+	int start, delta, some;
+
+	some = 0;
+	for(start = 0; start < NRUNES; start++) {
+		if(map[start] == start){
+			continue;
+		}
+
+		delta = TO_DELTA(map[start], start);
+		if(delta != (Rune)delta)
+			fatal("bad map delta %d", delta);
+		if(!some){
+			printf("static Rune __to%ss[] = {\n", label);
+			some = 1;
+		}
+		map[start] = start;
+		printf("\t0x%.4x, %d,\n", start, delta);
+	}
+	if(some)
+		printf("};\n\n");
+	return some;
+}
+
+/*
+ * generate tables and a function for to<label>rune
+ */
+static void
+mkto(const char* label, int* map, int usepairs)
+{
+	int tor, top, tos;
+
+	tor = mktorange(label, map, 0);
+	top = 0;
+	if(usepairs)
+		top = mktopair(label, map);
+	tos = mktosingle(label, map);
+
+	printf(
+		"Rune\n"
+		"to%srune(Rune c)\n"
+		"{\n"
+		"	Rune *p;\n"
+		"\n",
+		label);
+
+	if(tor)
+		printf(
+			"	p = rbsearch(c, __to%sr, nelem(__to%sr)/3, 3);\n"
+			"	if(p && c >= p[0] && c <= p[1])\n"
+			"		return c + p[2] - %d;\n",
+			label, label, TO_OFFSET);
+
+	if(top)
+		printf(
+			"	p = rbsearch(c, __to%sp, nelem(__to%sp)/3, 3);\n"
+			"	if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
+			"		return c + p[2] - %d;\n",
+			label, label, TO_OFFSET);
+
+	if(tos)
+		printf(
+			"	p = rbsearch(c, __to%ss, nelem(__to%ss)/2, 2);\n"
+			"	if(p && c == p[0])\n"
+			"		return c + p[1] - %d;\n",
+			label, label, TO_OFFSET);
+
+
+	printf(
+		"	return c;\n"
+		"}\n"
+		"\n"
+	);
+}
+
+// Make only range tables and a function for is<label>rune.
+static void
+mkisronly(const char* label, char* prop) {
+	mkisrange(label, prop, 1);
+	printf(
+		"int\n"
+		"is%srune(Rune c)\n"
+		"{\n"
+		"	Rune *p;\n"
+		"\n"
+		"	p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n"
+		"	if(p && c >= p[0] && c <= p[1])\n"
+		"		return 1;\n"
+		"	return 0;\n"
+		"}\n"
+		"\n",
+	        label, label, label);
+}
+
+/*
+ * generate the body of runetype.
+ * assumes there is a function Rune* rbsearch(Rune c, Rune *t, int n, int ne);
+ */
+static void
+mktables(char *src, int usepairs)
+{
+	printf("/* generated automatically by mkrunetype.c from %s */\n\n",
+		basename(src));
+
+	/*
+	 * we special case the space and digit tables, since they are assumed
+	 * to be small with several ranges.
+	 */
+	mkisronly("space", myisspace);
+	mkisronly("digit", myisdigit);
+
+	mkis("alpha", myisalpha, 0);
+	mkis("upper", myisupper, usepairs);
+	mkis("lower", myislower, usepairs);
+	mkis("title", myistitle, usepairs);
+
+	mkto("upper", mytoupper, usepairs);
+	mkto("lower", mytolower, usepairs);
+	mkto("title", mytotitle, usepairs);
+}
+
+/*
+ * find differences between the newly generated tables and current runetypes.
+ */
+static void
+check(void)
+{
+	int i;
+
+	for(i = 0; i < NRUNES; i++){
+		if(isdigitrune(i) != myisdigit[i])
+			fprintf(stderr, "isdigit diff at %x: runetype=%x, unicode=%x\n",
+				i, isdigitrune(i), myisdigit[i]);
+
+		if(isspacerune(i) != myisspace[i])
+			fprintf(stderr, "isspace diff at %x: runetype=%x, unicode=%x\n",
+				i, isspacerune(i), myisspace[i]);
+
+		if(isupperrune(i) != myisupper[i])
+			fprintf(stderr, "isupper diff at %x: runetype=%x, unicode=%x\n",
+				i, isupperrune(i), myisupper[i]);
+
+		if(islowerrune(i) != myislower[i])
+			fprintf(stderr, "islower diff at %x: runetype=%x, unicode=%x\n",
+				i, islowerrune(i), myislower[i]);
+
+		if(isalpharune(i) != myisalpha[i])
+			fprintf(stderr, "isalpha diff at %x: runetype=%x, unicode=%x\n",
+				i, isalpharune(i), myisalpha[i]);
+
+		if(toupperrune(i) != mytoupper[i])
+			fprintf(stderr, "toupper diff at %x: runetype=%x, unicode=%x\n",
+				i, toupperrune(i), mytoupper[i]);
+
+		if(tolowerrune(i) != mytolower[i])
+			fprintf(stderr, "tolower diff at %x: runetype=%x, unicode=%x\n",
+				i, tolowerrune(i), mytolower[i]);
+
+		if(istitlerune(i) != myistitle[i])
+			fprintf(stderr, "istitle diff at %x: runetype=%x, unicode=%x\n",
+				i, istitlerune(i), myistitle[i]);
+
+		if(totitlerune(i) != mytotitle[i])
+			fprintf(stderr, "totitle diff at %x: runetype=%x, unicode=%x\n",
+				i, totitlerune(i), mytotitle[i]);
+
+
+	}
+}
+
+static int
+mygetfields(char **fields, int nfields, char *str, const char *delim)
+{
+	int nf;
+
+	fields[0] = str;
+	nf = 1;
+	if(nf >= nfields)
+		return nf;
+
+	for(; *str; str++){
+		if(strchr(delim, *str) != NULL){
+			*str = '\0';
+			fields[nf++] = str + 1;
+			if(nf >= nfields)
+				break;
+		}
+	}
+	return nf;
+}
+
+static int
+getunicodeline(FILE *in, char **fields, char *buf)
+{
+	char *p;
+
+	if(fgets(buf, MAX_LINE, in) == NULL)
+		return 0;
+
+	p = strchr(buf, '\n');
+	if (p == NULL)
+		fatal("line too long");
+	*p = '\0';
+
+	if (mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS)
+		fatal("bad number of fields");
+
+	return 1;
+}
+
+static int
+getcode(char *s)
+{
+	int i, code;
+
+	code = 0;
+        i = 0;
+        /* Parse a hex number */
+	while(s[i]) {
+		code <<= 4;
+		if(s[i] >= '0' && s[i] <= '9')
+			code += s[i] - '0';
+		else if(s[i] >= 'A' && s[i] <= 'F')
+			code += s[i] - 'A' + 10;
+		else
+			fatal("bad code char '%c'", s[i]);
+                i++;
+	}
+	return code;
+}
+
+static void
+fatal(const char *fmt, ...)
+{
+	va_list arg;
+
+	fprintf(stderr, "%s: fatal error: ", argv0);
+	va_start(arg, fmt);
+	vfprintf(stderr, fmt, arg);
+	va_end(arg);
+	fprintf(stderr, "\n");
+
+	exit(1);
+}
--- a/src/lib9/utf/rune.c
+++ b/src/lib9/utf/rune.c
@ -1,20 +1,21 @@
 /*
 * The authors of this software are Rob Pike and Ken Thompson.
 *              Copyright (c) 2002 by Lucent Technologies.
+ *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.
 * Permission to use, copy, modify, and distribute this software for any
 * purpose without fee is hereby granted, provided that this entire notice
 * is included in all copies of any software which is or includes a copy
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

 enum
 {
@ -23,27 +24,150 @@ enum
 	Bit2	= 5,
 	Bit3	= 4,
 	Bit4	= 3,
+	Bit5	= 2,

 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */

 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,
+                                        /* 0001 1111 1111 1111 1111 1111 */

 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */

-	Bad	= Runeerror
+	Bad	= Runeerror,
 };

+/*
+ * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
+ * This is a slower but "safe" version of the old chartorune
+ * that works on strings that are not necessarily null-terminated.
+ *
+ * If you know for sure that your string is null-terminated,
+ * chartorune will be a bit faster.
+ *
+ * It is guaranteed not to access more than "length" bytes
+ * starting at the incoming pointer.  This is to avoid
+ * possible access violations.  If the string appears to be
+ * well-formed but incomplete (i.e., to get the whole Rune
+ * we'd need to read past str+length) then we'll set the Rune
+ * to Bad and return 0.
+ *
+ * Note that if we have decoding problems for other
+ * reasons, we return 1 instead of 0.
+ */
 int
-chartorune(Rune *rune, char *str)
+charntorune(Rune *rune, const char *str, int length)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
+	long l;
+
+	/* When we're not allowed to read anything */
+	if(length <= 0) {
+		goto badlen;
+	}
+
+	/*
+	 * one character sequence (7-bit value)
+	 *	00000-0007F => T1
+	 */
+	c = *(uchar*)str;
+	if(c < Tx) {
+		*rune = c;
+		return 1;
+	}
+
+	// If we can't read more than one character we must stop
+	if(length <= 1) {
+		goto badlen;
+	}
+
+	/*
+	 * two character sequence (11-bit value)
+	 *	0080-07FF => T2 Tx
+	 */
+	c1 = *(uchar*)(str+1) ^ Tx;
+	if(c1 & Testx)
+		goto bad;
+	if(c < T3) {
+		if(c < T2)
+			goto bad;
+		l = ((c << Bitx) | c1) & Rune2;
+		if(l <= Rune1)
+			goto bad;
+		*rune = l;
+		return 2;
+	}
+
+	// If we can't read more than two characters we must stop
+	if(length <= 2) {
+		goto badlen;
+	}
+
+	/*
+	 * three character sequence (16-bit value)
+	 *	0800-FFFF => T3 Tx Tx
+	 */
+	c2 = *(uchar*)(str+2) ^ Tx;
+	if(c2 & Testx)
+		goto bad;
+	if(c < T4) {
+		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+		if(l <= Rune2)
+			goto bad;
+		*rune = l;
+		return 3;
+	}
+
+	if (length <= 3)
+		goto badlen;
+
+	/*
+	 * four character sequence (21-bit value)
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	c3 = *(uchar*)(str+3) ^ Tx;
+	if (c3 & Testx)
+		goto bad;
+	if (c < T5) {
+		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+		if (l <= Rune3)
+			goto bad;
+		*rune = l;
+		return 4;
+	}
+
+	// Support for 5-byte or longer UTF-8 would go here, but
+	// since we don't have that, we'll just fall through to bad.
+
+	/*
+	 * bad decoding
+	 */
+bad:
+	*rune = Bad;
+	return 1;
+badlen:
+	*rune = Bad;
+	return 0;
+
+}
+
+
+/*
+ * This is the older "unsafe" version, which works fine on
+ * null-terminated strings.
+ */
+int
+chartorune(Rune *rune, const char *str)
+{
+	int c, c1, c2, c3;
 	long l;

 	/*
@ -88,6 +212,26 @@ chartorune(Rune *rune, char *str)
 		return 3;
 	}

+	/*
+	 * four character sequence (21-bit value)
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	c3 = *(uchar*)(str+3) ^ Tx;
+	if (c3 & Testx)
+		goto bad;
+	if (c < T5) {
+		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+		if (l <= Rune3)
+			goto bad;
+		*rune = l;
+		return 4;
+	}
+
+	/*
+	 * Support for 5-byte or longer UTF-8 would go here, but
+	 * since we don't have that, we'll just fall through to bad.
+	 */
+
 	/*
 	 * bad decoding
 	 */
@ -97,9 +241,16 @@ bad:
 }

 int
-runetochar(char *str, Rune *rune)
+isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
+	*consumed = charntorune(rune, str, length);
+	return *rune != Runeerror || *consumed == 3;
+}
+
+int
+runetochar(char *str, const Rune *rune)
 {
-	long c;
+	/* Runes are signed, so convert to unsigned for range check. */
+	unsigned long c;

 	/*
 	 * one character sequence
@ -121,57 +272,80 @@ runetochar(char *str, Rune *rune)
 		return 2;
 	}

+	/*
+	 * If the Rune is out of range, convert it to the error rune.
+	 * Do this test here because the error rune encodes to three bytes.
+	 * Doing it earlier would duplicate work, since an out of range
+	 * Rune wouldn't have fit in one or two bytes.
+	 */
+	if (c > Runemax)
+		c = Runeerror;
+
 	/*
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	if (c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence (21-bit value)
+	 *     10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 | (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx | (c & Maskx);
+	return 4;
 }

 int
-runelen(long c)
+runelen(Rune rune)
 {
-	Rune rune;
 	char str[10];

-	rune = c;
 	return runetochar(str, &rune);
 }

 int
-runenlen(Rune *r, int nrune)
+runenlen(const Rune *r, int nrune)
 {
 	int nb, c;

 	nb = 0;
 	while(nrune--) {
 		c = *r++;
-		if(c <= Rune1)
+		if (c <= Rune1)
 			nb++;
-		else
-		if(c <= Rune2)
+		else if (c <= Rune2)
 			nb += 2;
-		else
+		else if (c <= Rune3)
 			nb += 3;
+		else /* assert(c <= Rune4) */
+			nb += 4;
 	}
 	return nb;
 }

 int
-fullrune(char *str, int n)
+fullrune(const char *str, int n)
 {
-	int c;
-
-	if(n > 0) {
-		c = *(uchar*)str;
-		if(c < Tx)
+	if (n > 0) {
+		int c = *(uchar*)str;
+		if (c < Tx)
 			return 1;
-		if(n > 1)
-			if(c < T3 || n > 2)
+		if (n > 1) {
+			if (c < T3)
 				return 1;
+			if (n > 2) {
+				if (c < T4 || n > 3)
+					return 1;
+			}
+		}
 	}
 	return 0;
 }
--- a/src/lib9/utf/runetype.c
+++ b/src/lib9/utf/runetype.c
--- a/src/lib9/utf/utf.h
+++ b/src/lib9/utf/utf.h
@ -0,0 +1,248 @@
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 1998-2002 by Lucent Technologies.
+ *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+
+#ifndef _UTFH_
+#define _UTFH_ 1
+
+#include <stdint.h>
+
+typedef signed int Rune;	/* Code-point values in Unicode 4.0 are 21 bits wide.*/
+
+enum
+{
+  UTFmax	= 4,		/* maximum bytes per rune */
+  Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */
+  Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
+  Runeerror	= 0xFFFD,	/* decoding error in UTF */
+  Runemax	= 0x10FFFF,	/* maximum rune value */
+};
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * rune routines
+ */
+
+/*
+ * These routines were written by Rob Pike and Ken Thompson
+ * and first appeared in Plan 9.
+ * SEE ALSO
+ * utf (7)
+ * tcs (1)
+*/
+
+// runetochar copies (encodes) one rune, pointed to by r, to at most
+// UTFmax bytes starting at s and returns the number of bytes generated.
+
+int runetochar(char* s, const Rune* r);
+
+
+// chartorune copies (decodes) at most UTFmax bytes starting at s to
+// one rune, pointed to by r, and returns the number of bytes consumed.
+// If the input is not exactly in UTF format, chartorune will set *r
+// to Runeerror and return 1.
+//
+// Note: There is no special case for a "null-terminated" string. A
+// string whose first byte has the value 0 is the UTF8 encoding of the
+// Unicode value 0 (i.e., ASCII NUL). A byte value of 0 is illegal
+// anywhere else in a UTF sequence.
+
+int chartorune(Rune* r, const char* s);
+
+
+// charntorune is like chartorune, except that it will access at most
+// n bytes of s.  If the UTF sequence is incomplete within n bytes,
+// charntorune will set *r to Runeerror and return 0. If it is complete
+// but not in UTF format, it will set *r to Runeerror and return 1.
+// 
+// Added 2004-09-24 by Wei-Hwa Huang
+
+int charntorune(Rune* r, const char* s, int n);
+
+// isvalidcharntorune(str, n, r, consumed)
+// is a convenience function that calls "*consumed = charntorune(r, str, n)"
+// and returns an int (logically boolean) indicating whether str begins
+// with a valid and complete UTF sequence within its first n bytes.
+
+int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
+
+// runelen returns the number of bytes required to convert r into UTF.
+
+int runelen(Rune r);
+
+
+// runenlen returns the number of bytes required to convert the n
+// runes pointed to by r into UTF.
+
+int runenlen(const Rune* r, int n);
+
+
+// fullrune returns 1 if the string s of length n is long enough to be
+// decoded by chartorune, and 0 otherwise. This does not guarantee
+// that the string contains a legal UTF encoding. This routine is used
+// by programs that obtain input one byte at a time and need to know
+// when a full rune has arrived.
+
+int fullrune(const char* s, int n);
+
+// The following routines are analogous to the corresponding string
+// routines with "utf" substituted for "str", and "rune" substituted
+// for "chr".
+
+// utflen returns the number of runes that are represented by the UTF
+// string s. (cf. strlen)
+
+int utflen(const char* s);
+
+
+// utfnlen returns the number of complete runes that are represented
+// by the first n bytes of the UTF string s. If the last few bytes of
+// the string contain an incompletely coded rune, utfnlen will not
+// count them; in this way, it differs from utflen, which includes
+// every byte of the string. (cf. strnlen)
+
+int utfnlen(const char* s, long n);
+
+
+// utfrune returns a pointer to the first occurrence of rune r in the
+// UTF string s, or 0 if r does not occur in the string.  The NUL
+// byte terminating a string is considered to be part of the string s.
+// (cf. strchr)
+
+const char* utfrune(const char* s, Rune r);
+
+
+// utfrrune returns a pointer to the last occurrence of rune r in the
+// UTF string s, or 0 if r does not occur in the string.  The NUL
+// byte terminating a string is considered to be part of the string s.
+// (cf. strrchr)
+
+const char* utfrrune(const char* s, Rune r);
+
+
+// utfutf returns a pointer to the first occurrence of the UTF string
+// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
+// null string, utfutf returns s1. (cf. strstr)
+
+const char* utfutf(const char* s1, const char* s2);
+
+
+// utfecpy copies UTF sequences until a null sequence has been copied,
+// but writes no sequences beyond es1.  If any sequences are copied,
+// s1 is terminated by a null sequence, and a pointer to that sequence
+// is returned.  Otherwise, the original s1 is returned. (cf. strecpy)
+
+char* utfecpy(char *s1, char *es1, const char *s2);
+
+
+
+// These functions are rune-string analogues of the corresponding
+// functions in strcat (3).
+// 
+// These routines first appeared in Plan 9.
+// SEE ALSO
+// memmove (3)
+// rune (3)
+// strcat (2)
+//
+// BUGS: The outcome of overlapping moves varies among implementations.
+
+Rune* runestrcat(Rune* s1, const Rune* s2);
+Rune* runestrncat(Rune* s1, const Rune* s2, long n);
+
+const Rune* runestrchr(const Rune* s, Rune c);
+
+int runestrcmp(const Rune* s1, const Rune* s2);
+int runestrncmp(const Rune* s1, const Rune* s2, long n);
+
+Rune* runestrcpy(Rune* s1, const Rune* s2);
+Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
+Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
+
+Rune* runestrdup(const Rune* s);
+
+const Rune* runestrrchr(const Rune* s, Rune c);
+long runestrlen(const Rune* s);
+const Rune* runestrstr(const Rune* s1, const Rune* s2);
+
+
+
+// The following routines test types and modify cases for Unicode
+// characters.  Unicode defines some characters as letters and
+// specifies three cases: upper, lower, and title.  Mappings among the
+// cases are also defined, although they are not exhaustive: some
+// upper case letters have no lower case mapping, and so on.  Unicode
+// also defines several character properties, a subset of which are
+// checked by these routines.  These routines are based on Unicode
+// version 3.0.0.
+//
+// NOTE: The routines are implemented in C, so the boolean functions
+// (e.g., isupperrune) return 0 for false and 1 for true.
+//
+//
+// toupperrune, tolowerrune, and totitlerune are the Unicode case
+// mappings. These routines return the character unchanged if it has
+// no defined mapping.
+
+Rune toupperrune(Rune r);
+Rune tolowerrune(Rune r);
+Rune totitlerune(Rune r);
+
+
+// isupperrune tests for upper case characters, including Unicode
+// upper case letters and targets of the toupper mapping. islowerrune
+// and istitlerune are defined analogously. 
+ 
+int isupperrune(Rune r);
+int islowerrune(Rune r);
+int istitlerune(Rune r);
+
+
+// isalpharune tests for Unicode letters; this includes ideographs in
+// addition to alphabetic characters.
+
+int isalpharune(Rune r);
+
+
+// isdigitrune tests for digits. Non-digit numbers, such as Roman
+// numerals, are not included.
+
+int isdigitrune(Rune r);
+
+
+// isideographicrune tests for ideographic characters and numbers, as
+// defined by the Unicode standard.
+
+int isideographicrune(Rune r);
+
+
+// isspacerune tests for whitespace characters, including "C" locale
+// whitespace, Unicode defined whitespace, and the "zero-width
+// non-break space" character.
+
+int isspacerune(Rune r);
+
+
+// (The comments in this file were copied from the manpage files rune.3,
+// isalpharune.3, and runestrcat.3. Some formatting changes were also made
+// to conform to Google style. /JRM 11/11/05)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif
--- a/src/lib9/utf/utfdef.h
+++ b/src/lib9/utf/utfdef.h
@ -12,36 +12,17 @@
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */

-/*
- * compiler directive on Plan 9
- */
-#ifndef USED
-#define USED(x) if(x);else
-#endif
+#define uchar _utfuchar
+#define ushort _utfushort
+#define uint _utfuint
+#define ulong _utfulong
+#define vlong _utfvlong
+#define uvlong _utfuvlong

-/*
- * easiest way to make sure these are defined
- */
-#define uchar	_fmtuchar
-#define ushort	_fmtushort
-#define uint	_fmtuint
-#define ulong	_fmtulong
-#define vlong	_fmtvlong
-#define uvlong	_fmtuvlong
 typedef unsigned char		uchar;
 typedef unsigned short		ushort;
 typedef unsigned int		uint;
 typedef unsigned long		ulong;
-typedef unsigned long long	uvlong;
-typedef long long		vlong;
-
-/*
- * nil cannot be ((void*)0) on ANSI C,
- * because it is used for function pointers
- */
-#undef	nil
-#define	nil	0
-
-#undef	nelem
-#define	nelem	((void*)0)

+#define nelem(x) (sizeof(x)/sizeof((x)[0]))
+#define nil ((void*)0)
--- a/src/lib9/utf/utfecpy.c
+++ b/src/lib9/utf/utfecpy.c
@ -7,18 +7,17 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
-#define _BSD_SOURCE 1	/* memccpy */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

 char*
-utfecpy(char *to, char *e, char *from)
+utfecpy(char *to, char *e, const char *from)
 {
 	char *end;

--- a/src/lib9/utf/utflen.c
+++ b/src/lib9/utf/utflen.c
@ -7,17 +7,17 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

 int
-utflen(char *s)
+utflen(const char *s)
 {
 	int c;
 	long n;
@ -34,4 +34,5 @@ utflen(char *s)
 			s += chartorune(&rune, s);
 		n++;
 	}
+	return 0;
 }
--- a/src/lib9/utf/utfnlen.c
+++ b/src/lib9/utf/utfnlen.c
@ -7,22 +7,22 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

 int
-utfnlen(char *s, long m)
+utfnlen(const char *s, long m)
 {
 	int c;
 	long n;
 	Rune rune;
-	char *es;
+	const char *es;

 	es = s + m;
 	for(n = 0; s < es; n++) {
--- a/src/lib9/utf/utfrrune.c
+++ b/src/lib9/utf/utfrrune.c
@ -7,21 +7,22 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

+const
 char*
-utfrrune(char *s, long c)
+utfrrune(const char *s, Rune c)
 {
 	long c1;
 	Rune r;
-	char *s1;
+	const char *s1;

 	if(c < Runesync)		/* not part of utf sequence */
 		return strrchr(s, c);
@ -42,4 +43,5 @@ utfrrune(char *s, long c)
 			s1 = s;
 		s += c1;
 	}
+	return 0;
 }
--- a/src/lib9/utf/utfrune.c
+++ b/src/lib9/utf/utfrune.c
@ -7,17 +7,18 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"

+const
 char*
-utfrune(char *s, long c)
+utfrune(const char *s, Rune c)
 {
 	long c1;
 	Rune r;
@ -41,4 +42,5 @@ utfrune(char *s, long c)
 			return s;
 		s += n;
 	}
+	return 0;
 }
--- a/src/lib9/utf/utfutf.c
+++ b/src/lib9/utf/utfutf.c
@ -7,24 +7,25 @@
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
- * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
-#include "plan9.h"
 #include "utf.h"
+#include "utfdef.h"


 /*
 * Return pointer to first occurrence of s2 in s1,
 * 0 if none
 */
+const
 char*
-utfutf(char *s1, char *s2)
+utfutf(const char *s1, const char *s2)
 {
-	char *p;
+	const char *p;
 	long f, n1, n2;
 	Rune r;

@ -34,7 +35,7 @@ utfutf(char *s1, char *s2)
 		return strstr(s1, s2);

 	n2 = strlen(s2);
-	for(p=s1; p=utfrune(p, f); p+=n1)
+	for(p=s1; (p=utfrune(p, f)) != 0; p+=n1)
 		if(strncmp(p, s2, n2) == 0)
 			return p;
 	return 0;
--- a/src/runtime/Makefile
+++ b/src/runtime/Makefile
@ -20,6 +20,7 @@ LIBOFILES=\
 	runtime.$O\
 	map.$O\
 	print.$O\
+	rune.$O\
 	string.$O\
 	sys_file.$O\

--- a/src/runtime/rune.c
+++ b/src/runtime/rune.c
@ -0,0 +1,224 @@
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+
+/*
+ * This code is copied, with slight editing due to type differences,
+ * from a subset of ../lib9/utf/rune.c
+ */
+
+#include "runtime.h"
+
+enum	/* bit layout of UTF-8 sequences, shared by charntorune and runetochar */
+{
+	Bit1	= 7,	/* payload bits in the single byte of a 1-byte sequence */
+	Bitx	= 6,	/* payload bits in each continuation byte */
+	Bit2	= 5,	/* payload bits in the lead byte of a 2-byte sequence */
+	Bit3	= 4,	/* payload bits in the lead byte of a 3-byte sequence */
+	Bit4	= 3,	/* payload bits in the lead byte of a 4-byte sequence */
+	Bit5	= 2, 	/* only used below to derive T5 */
+	/* tag values: the fixed high bits that identify each kind of byte */
+	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000: 1-byte sequence */
+	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000: continuation byte */
+	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000: lead byte, 2-byte sequence */
+	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000: lead byte, 3-byte sequence */
+	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000: lead byte, 4-byte sequence */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000: lead bytes >= this are rejected by charntorune */
+	/* largest value that fits in a sequence of the given length */
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,
+                                        /* 0001 1111 1111 1111 1111 1111 (21 bits, more than Runemax) */
+
+	Maskx	= (1<<Bitx)-1,			/* 0011 1111: payload bits of a continuation byte */
+	Testx	= Maskx ^ 0xFF,			/* 1100 0000: tag bits; must be zero after byte ^ Tx */
+
+	Runeerror	= 0xFFFD,	/* rune substituted for undecodable or out-of-range input */
+	Runeself	= 0x80,	/* not referenced in this file; presumably the first non-ASCII byte value */
+
+	Bad	= Runeerror,	/* what charntorune stores in *rune on failure */
+	
+	Runemax	= 0x10FFFF,	/* maximum rune value */
+};
+
+/*
+ * charntorune decodes the UTF-8 sequence at str, reading at most
+ * length bytes, and stores the result in *rune.  Unlike chartorune
+ * it does not need the input to be null-terminated, at the cost of
+ * being a bit slower.
+ * (Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24.)
+ *
+ * It never reads at or beyond str+length.  The return value is the
+ * number of bytes consumed:
+ *	1-4	a well-formed sequence was decoded into *rune;
+ *	1	the bytes are malformed (bad continuation byte,
+ *		overlong form, value above Runemax, or a lead byte
+ *		for 5 or more bytes); *rune is set to Bad;
+ *	0	length ran out before the sequence could be fully
+ *		examined; *rune is set to Bad.
+ * Values above Runemax are refused so that decoding agrees with
+ * runetochar, which never emits them.
+ */
+int32
+charntorune(int32 *rune, byte *str, int32 length)
+{
+	int32 c, c1, c2, c3;
+	int32 l;
+
+	/* When we're not allowed to read anything */
+	if(length <= 0) {
+		goto badlen;
+	}
+
+	/*
+	 * one character sequence (7-bit value)
+	 *	00000-0007F => T1
+	 */
+	c = *(byte*)str;  /* cast not necessary, but kept for safety */
+	if(c < Tx) {
+		*rune = c;
+		return 1;
+	}
+
+	// If we can't read more than one character we must stop
+	if(length <= 1) {
+		goto badlen;
+	}
+
+	/*
+	 * two character sequence (11-bit value)
+	 *	0080-07FF => T2 Tx
+	 */
+	c1 = *(byte*)(str+1) ^ Tx;	/* strips the 10 tag from a valid continuation byte */
+	if(c1 & Testx)			/* tag bits were not 10 */
+		goto bad;
+	if(c < T3) {
+		if(c < T2)		/* first byte is itself a continuation byte */
+			goto bad;
+		l = ((c << Bitx) | c1) & Rune2;
+		if(l <= Rune1)		/* overlong: would have fit in one byte */
+			goto bad;
+		*rune = l;
+		return 2;
+	}
+
+	// If we can't read more than two characters we must stop
+	if(length <= 2) {
+		goto badlen;
+	}
+
+	/*
+	 * three character sequence (16-bit value)
+	 *	0800-FFFF => T3 Tx Tx
+	 */
+	c2 = *(byte*)(str+2) ^ Tx;
+	if(c2 & Testx)
+		goto bad;
+	if(c < T4) {
+		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+		if(l <= Rune2)		/* overlong: would have fit in two bytes */
+			goto bad;
+		*rune = l;
+		return 3;
+	}
+
+	if (length <= 3)
+		goto badlen;
+
+	/*
+	 * four character sequence (21 bits encoded, capped at Runemax)
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 */
+	c3 = *(byte*)(str+3) ^ Tx;
+	if (c3 & Testx)
+		goto bad;
+	if (c < T5) {
+		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+		if (l <= Rune3 || l > Runemax)	/* overlong, or past the last valid rune */
+			goto bad;
+		*rune = l;
+		return 4;
+	}
+
+	// Support for 5-byte or longer UTF-8 would go here, but
+	// since we don't have that, we'll just fall through to bad.
+
+	/*
+	 * bad decoding
+	 */
+bad:
+	*rune = Bad;
+	return 1;
+badlen:
+	*rune = Bad;
+	return 0;
+
+}
+
+/*
+ * runetochar writes the UTF-8 encoding of rune to str and returns
+ * the number of bytes written (1 to 4).  A rune outside 0..Runemax
+ * is encoded as Runeerror, which takes three bytes.
+ */
+int32
+runetochar(byte *str, int32 rune)  /* note: in original, arg2 was pointer */
+{
+	uint32 v;	/* unsigned copy, so a negative rune fails the range test */
+	int32 n, i;	/* n: encoded length; i: index of the byte being written */
+	int32 lead;	/* tag bits for str[0] */
+
+	v = rune;
+
+	/* 00000-0007F => 00-7F: a 7-bit value is stored as itself */
+	if(v <= Rune1) {
+		str[0] = v;
+		return 1;
+	}
+
+	/*
+	 * Choose the sequence length and lead-byte tag:
+	 *	0080-07FF    => T2 Tx
+	 *	0800-FFFF    => T3 Tx Tx
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 * The out-of-range test comes after the two-byte case because
+	 * such a value can never fit in two bytes, and it must come
+	 * before the three-byte case because Runeerror needs three.
+	 */
+	if(v <= Rune2) {
+		n = 2;
+		lead = T2;
+	} else {
+		if(v > Runemax)
+			v = Runeerror;
+		if(v <= Rune3) {
+			n = 3;
+			lead = T3;
+		} else {
+			n = 4;
+			lead = T4;
+		}
+	}
+
+	/*
+	 * Fill in the continuation bytes from last to first, Bitx bits
+	 * at a time; whatever is left of v belongs in the lead byte.
+	 */
+	for(i = n-1; i > 0; i--) {
+		str[i] = Tx | (v & Maskx);
+		v >>= Bitx;
+	}
+
+	str[0] = lead | v;
+	return n;
+}
--- a/src/runtime/runtime.h
+++ b/src/runtime/runtime.h
@ -85,6 +85,8 @@ enum
 int32 strcmp(byte*, byte*);
 int32 findnull(int8*);
 void	dump(byte*, int32);
+int32 runetochar(byte*, int32);	/* rune.c: encode one rune as UTF-8, returns byte count */
+int32 charntorune(int32*, byte*, int32);	/* rune.c: length-bounded UTF-8 decode, returns bytes consumed */

 extern string	emptystring;
 extern int32 debug;
--- a/src/runtime/string.c
+++ b/src/runtime/string.c
@ -151,55 +151,6 @@ sys·indexstring(string s, int32 i, byte b)
 	FLUSH(&b);
 }

-/*
- * this is the plan9 runetochar
- * extended for 36 bits in 7 bytes
- * note that it truncates to 32 bits
- * through the argument passing.
- */
-static int32
-runetochar(byte *str, uint32 c)
-{
-	int32 i, n;
-	uint32 mask, mark;
-
-	/*
-	 * one character in 7 bits
-	 */
-	if(c <= 0x07FUL) {
-		str[0] = c;
-		return 1;
-	}
-
-	/*
-	 * every new character picks up 5 bits
-	 * one less in the first byte and
-	 * six more in an extension byte
-	 */
-	mask = 0x7ffUL;
-	mark = 0xC0UL;
-	for(n=1;; n++) {
-		if(c <= mask)
-			break;
-		mask = (mask<<5) | 0x1fUL;
-		mark = (mark>>1) | 0x80UL;
-	}
-
-	/*
-	 * lay down the bytes backwards
-	 * n is the number of extension bytes
-	 * mask is the max codepoint
-	 * mark is the zeroth byte indicator
-	 */
-	for(i=n; i>0; i--) {
-		str[i] = 0x80UL | (c&0x3fUL);
-		c >>= 6;
-	}
-
-	str[0] = mark|c;
-	return n+1;
-}
-
 void
 sys·intstring(int64 v, string s)
 {
--- a/test/string_lit.go
+++ b/test/string_lit.go
@ -75,5 +75,14 @@ func main() {
 	       `\000\123\x00\312\xFE\u0123\ubabe\U0000babe`,
           "backslashes 2 (backquote)");
 	assert("\\x\\u\\U\\", `\x\u\U\`, "backslash 3 (backquote)");
+
+	// test large runes. perhaps not the most logical place for this test.
+	var r int32;
+	r = 0x10ffff;	// largest rune value
+	s = string(r);
+	assert(s, "\xf4\x8f\xbf\xbf", "largest rune");
+	r = 0x10ffff + 1;
+	s = string(r);
+	assert(s, "\xef\xbf\xbd", "too-large rune");
 	sys.exit(ecode);
 }