mirror of
https://github.com/golang/go
synced 2024-11-25 10:57:58 -07:00
improved sentence extraction:
- don't forget first periods
- look at capitalization of last char before periods

R=rsc
http://go/go-review/1024027
This commit is contained in:
parent
ed6eb5b577
commit
c532940ecf
@ -23,6 +23,7 @@ import (
|
|||||||
"sync";
|
"sync";
|
||||||
"template";
|
"template";
|
||||||
"time";
|
"time";
|
||||||
|
"unicode";
|
||||||
"utf8";
|
"utf8";
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -137,21 +138,38 @@ func htmlEscape(s string) string {
|
|||||||
|
|
||||||
|
|
||||||
func firstSentence(s string) string {
|
func firstSentence(s string) string {
|
||||||
// find first period followed by whitespace, or just the first period
|
i := -1; // index+1 of first period
|
||||||
i := -1;
|
j := -1; // index+1 of first period that is followed by white space
|
||||||
for j, ch := range s {
|
prev := 'A';
|
||||||
|
for k, ch := range s {
|
||||||
|
k1 := k+1;
|
||||||
if ch == '.' {
|
if ch == '.' {
|
||||||
i = j+1; // include period
|
if i < 0 {
|
||||||
if i < len(s) && s[i] <= ' ' {
|
i = k1; // first period
|
||||||
|
}
|
||||||
|
if k1 < len(s) && s[k1] <= ' ' {
|
||||||
|
if j < 0 {
|
||||||
|
j = k1; // first period followed by white space
|
||||||
|
}
|
||||||
|
if !unicode.IsUpper(prev) {
|
||||||
|
j = k1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if i < 0 {
|
prev = ch;
|
||||||
// no period found, use the enire string
|
|
||||||
i = len(s);
|
|
||||||
}
|
}
|
||||||
return s[0:i];
|
|
||||||
|
if j < 0 {
|
||||||
|
// use the next best period
|
||||||
|
j = i;
|
||||||
|
if j < 0 {
|
||||||
|
// no period at all, use the entire string
|
||||||
|
j = len(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return s[0:j];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user