1
0
mirror of https://github.com/golang/go synced 2024-11-25 05:57:57 -07:00

improved sentence extraction:

- don't forget first periods
- look at capitalization of last char before periods

R=rsc
http://go/go-review/1024027
This commit is contained in:
Robert Griesemer 2009-11-08 21:48:51 -08:00
parent ed6eb5b577
commit c532940ecf

View File

@@ -23,6 +23,7 @@ import (
"sync";
"template";
"time";
"unicode";
"utf8";
)
@@ -137,21 +138,38 @@ func htmlEscape(s string) string {
func firstSentence(s string) string {
// find first period followed by whitespace, or just the first period
i := -1;
for j, ch := range s {
i := -1; // index+1 of first period
j := -1; // index+1 of first period that is followed by white space
prev := 'A';
for k, ch := range s {
k1 := k+1;
if ch == '.' {
i = j+1; // include period
if i < len(s) && s[i] <= ' ' {
break;
if i < 0 {
i = k1; // first period
}
if k1 < len(s) && s[k1] <= ' ' {
if j < 0 {
j = k1; // first period followed by white space
}
if !unicode.IsUpper(prev) {
j = k1;
break;
}
}
}
prev = ch;
}
if i < 0 {
// no period found, use the entire string
i = len(s);
if j < 0 {
// use the next best period
j = i;
if j < 0 {
// no period at all, use the entire string
j = len(s);
}
}
return s[0:i];
return s[0:j];
}