mirror of
https://github.com/golang/go
synced 2024-11-24 15:20:03 -07:00
regex-dna
R=rsc DELTA=243 (242 added, 0 deleted, 1 changed) OCL=32786 CL=32791
This commit is contained in:
parent
6adadeb3ab
commit
d9a0bc9b58
154
test/bench/regex-dna.c
Normal file
154
test/bench/regex-dna.c
Normal file
@ -0,0 +1,154 @@
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of "The Computer Language Benchmarks Game" nor the
|
||||
name of "The Computer Language Shootout Benchmarks" nor the names of
|
||||
its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
** The Computer Language Shootout
|
||||
** http://shootout.alioth.debian.org/
|
||||
** contributed by Mike Pall
|
||||
**
|
||||
** regex-dna benchmark using PCRE
|
||||
**
|
||||
** compile with:
|
||||
** gcc -O3 -fomit-frame-pointer -o regexdna regexdna.c -lpcre
|
||||
*/
|
||||
|
||||
#define __USE_STRING_INLINES
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <pcre.h>
|
||||
|
||||
typedef struct fbuf {
|
||||
char *buf;
|
||||
size_t size, len;
|
||||
} fbuf_t;
|
||||
|
||||
static void fb_init(fbuf_t *b)
|
||||
{
|
||||
b->buf = NULL;
|
||||
b->len = b->size = 0;
|
||||
}
|
||||
|
||||
static char *fb_need(fbuf_t *b, size_t need)
|
||||
{
|
||||
need += b->len;
|
||||
if (need > b->size) {
|
||||
if (b->size == 0) b->size = need;
|
||||
else while (need > b->size) b->size += b->size;
|
||||
if (!(b->buf = realloc(b->buf, b->size))) exit(1);
|
||||
}
|
||||
return b->buf+b->len;
|
||||
}
|
||||
|
||||
#define FB_MINREAD (3<<16)
|
||||
|
||||
/* Read all of a stdio stream into dst buffer. */
|
||||
static size_t fb_readall(fbuf_t *dst, FILE *fp)
|
||||
{
|
||||
char *dp;
|
||||
int n;
|
||||
for (dp = fb_need(dst, FB_MINREAD);
|
||||
(n = fread(dp, 1, dst->size-dst->len, fp)) > 0;
|
||||
dp = fb_need(dst, FB_MINREAD)) dst->len += n;
|
||||
if (ferror(fp)) exit(1);
|
||||
return dst->len;
|
||||
}
|
||||
|
||||
/* Substitute pattern p with replacement r, copying from src to dst buffer. */
|
||||
static size_t fb_subst(fbuf_t *dst, fbuf_t *src, const char *p, const char *r)
|
||||
{
|
||||
pcre *re;
|
||||
pcre_extra *re_ex;
|
||||
const char *re_e;
|
||||
char *dp;
|
||||
int re_eo, m[3], pos, rlen, clen;
|
||||
if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1);
|
||||
re_ex = pcre_study(re, 0, &re_e);
|
||||
for (dst->len = 0, rlen = strlen(r), pos = 0;
|
||||
pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
|
||||
pos = m[1]) {
|
||||
clen = m[0]-pos;
|
||||
dp = fb_need(dst, clen+rlen);
|
||||
dst->len += clen+rlen;
|
||||
memcpy(dp, src->buf+pos, clen);
|
||||
memcpy(dp+clen, r, rlen);
|
||||
}
|
||||
clen = src->len-pos;
|
||||
dp = fb_need(dst, clen);
|
||||
dst->len += clen;
|
||||
memcpy(dp, src->buf+pos, clen);
|
||||
return dst->len;
|
||||
}
|
||||
|
||||
/* Count all matches with pattern p in src buffer. */
|
||||
static int fb_countmatches(fbuf_t *src, const char *p)
|
||||
{
|
||||
pcre *re;
|
||||
pcre_extra *re_ex;
|
||||
const char *re_e;
|
||||
int re_eo, m[3], pos, count;
|
||||
if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1);
|
||||
re_ex = pcre_study(re, 0, &re_e);
|
||||
for (count = 0, pos = 0;
|
||||
pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
|
||||
pos = m[1]) count++;
|
||||
return count;
|
||||
}
|
||||
|
||||
static const char *variants[] = {
|
||||
"agggtaaa|tttaccct", "[cgt]gggtaaa|tttaccc[acg]",
|
||||
"a[act]ggtaaa|tttacc[agt]t", "ag[act]gtaaa|tttac[agt]ct",
|
||||
"agg[act]taaa|ttta[agt]cct", "aggg[acg]aaa|ttt[cgt]ccct",
|
||||
"agggt[cgt]aa|tt[acg]accct", "agggta[cgt]a|t[acg]taccct",
|
||||
"agggtaa[cgt]|[acg]ttaccct", NULL
|
||||
};
|
||||
|
||||
static const char *subst[] = {
|
||||
"B", "(c|g|t)", "D", "(a|g|t)", "H", "(a|c|t)", "K", "(g|t)",
|
||||
"M", "(a|c)", "N", "(a|c|g|t)", "R", "(a|g)", "S", "(c|g)",
|
||||
"V", "(a|c|g)", "W", "(a|t)", "Y", "(c|t)", NULL
|
||||
};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
fbuf_t seq[2];
|
||||
const char **pp;
|
||||
size_t ilen, clen, slen;
|
||||
int flip;
|
||||
fb_init(&seq[0]);
|
||||
fb_init(&seq[1]);
|
||||
ilen = fb_readall(&seq[0], stdin);
|
||||
clen = fb_subst(&seq[1], &seq[0], ">.*|\n", "");
|
||||
for (pp = variants; *pp; pp++)
|
||||
printf("%s %d\n", *pp, fb_countmatches(&seq[1], *pp));
|
||||
for (slen = 0, flip = 1, pp = subst; *pp; pp += 2, flip = 1-flip)
|
||||
slen = fb_subst(&seq[1-flip], &seq[flip], *pp, pp[1]);
|
||||
printf("\n%zu\n%zu\n%zu\n", ilen, clen, slen);
|
||||
return 0;
|
||||
}
|
117
test/bench/regex-dna.go
Normal file
117
test/bench/regex-dna.go
Normal file
@ -0,0 +1,117 @@
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of "The Computer Language Benchmarks Game" nor the
|
||||
name of "The Computer Language Shootout Benchmarks" nor the names of
|
||||
its contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* The Computer Language Benchmarks Game
|
||||
* http://shootout.alioth.debian.org/
|
||||
*
|
||||
* contributed by The Go Authors.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt";
|
||||
"io";
|
||||
"os";
|
||||
"regexp";
|
||||
)
|
||||
|
||||
func compile(s string) *regexp.Regexp {
|
||||
r, err := regexp.Compile(s);
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "can't compile pattern %q: %s\n", s, err);
|
||||
os.Exit(2);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
var variants = []string {
|
||||
"agggtaaa|tttaccct",
|
||||
"[cgt]gggtaaa|tttaccc[acg]",
|
||||
"a[act]ggtaaa|tttacc[agt]t",
|
||||
"ag[act]gtaaa|tttac[agt]ct",
|
||||
"agg[act]taaa|ttta[agt]cct",
|
||||
"aggg[acg]aaa|ttt[cgt]ccct",
|
||||
"agggt[cgt]aa|tt[acg]accct",
|
||||
"agggta[cgt]a|t[acg]taccct",
|
||||
"agggtaa[cgt]|[acg]ttaccct",
|
||||
}
|
||||
|
||||
type Subst struct {
|
||||
pat, repl string
|
||||
}
|
||||
|
||||
var substs = [] Subst {
|
||||
Subst {"B", "(c|g|t)"},
|
||||
Subst {"D", "(a|g|t)"},
|
||||
Subst {"H", "(a|c|t)"},
|
||||
Subst {"K", "(g|t)"},
|
||||
Subst {"M", "(a|c)"},
|
||||
Subst {"N", "(a|c|g|t)"},
|
||||
Subst {"R", "(a|g)"},
|
||||
Subst {"S", "(c|g)"},
|
||||
Subst {"V", "(a|c|g)"},
|
||||
Subst {"W", "(a|t)"},
|
||||
Subst {"Y", "(c|t)"},
|
||||
}
|
||||
|
||||
func countMatches(pat, str string) int {
|
||||
re := compile(pat);
|
||||
n := 0;
|
||||
pos := 0;
|
||||
for {
|
||||
e := re.Execute(str);
|
||||
if len(e) == 0 {
|
||||
break;
|
||||
}
|
||||
n++;
|
||||
str = str[e[1]:len(str)];
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
func main() {
|
||||
bytes, err := io.ReadFile("/dev/stdin");
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "can't read input: %s\n", err);
|
||||
os.Exit(2);
|
||||
}
|
||||
str := string(bytes);
|
||||
ilen := len(str);
|
||||
// Delete the comment lines and newlines
|
||||
str = compile("(>[^\n]+)?\n").ReplaceAll(str, "");
|
||||
clen := len(str);
|
||||
for i, s := range variants {
|
||||
fmt.Printf("%s %d\n", s, countMatches(s, str));
|
||||
}
|
||||
for i, sub := range substs {
|
||||
str = compile(sub.pat).ReplaceAll(str, sub.repl);
|
||||
}
|
||||
fmt.Printf("\n%d\n%d\n%d\n", ilen, clen, len(str));
|
||||
}
|
13
test/bench/regex-dna.txt
Normal file
13
test/bench/regex-dna.txt
Normal file
@ -0,0 +1,13 @@
|
||||
agggtaaa|tttaccct 0
|
||||
[cgt]gggtaaa|tttaccc[acg] 3
|
||||
a[act]ggtaaa|tttacc[agt]t 9
|
||||
ag[act]gtaaa|tttac[agt]ct 8
|
||||
agg[act]taaa|ttta[agt]cct 10
|
||||
aggg[acg]aaa|ttt[cgt]ccct 3
|
||||
agggt[cgt]aa|tt[acg]accct 4
|
||||
agggta[cgt]a|t[acg]taccct 3
|
||||
agggtaa[cgt]|[acg]ttaccct 5
|
||||
|
||||
101745
|
||||
100000
|
||||
133640
|
@ -53,3 +53,8 @@ fannkuch 12
|
||||
gccgo -O2 fannkuch.go 64.89u 0.00s 64.92r
|
||||
gc fannkuch 124.59u 0.00s 124.67r
|
||||
gc_B fannkuch 91.14u 0.00s 91.16r
|
||||
|
||||
regex-dna 100000
|
||||
gcc -O2 regex-dna.c -lpcre 0.92u 0.00s 0.99r
|
||||
gc regexp-dna 26.94u 0.18s 28.75r
|
||||
gc_B regexp-dna 26.51u 0.09s 26.75r
|
||||
|
@ -65,9 +65,20 @@ fannkuch() {
|
||||
run 'gc_B fannkuch' $O.out -n 12
|
||||
}
|
||||
|
||||
regexdna() {
|
||||
gcc -O2 fasta.c
|
||||
a.out 100000 > x
|
||||
echo 'regex-dna 100000'
|
||||
run 'gcc -O2 regex-dna.c -lpcre' a.out <x
|
||||
# run 'gccgo -O2 regex-dna.go' a.out <x # pages badly; don't run
|
||||
run 'gc regex-dna' $O.out <x
|
||||
run 'gc_B regex-dna' $O.out <x
|
||||
rm x
|
||||
}
|
||||
|
||||
case $# in
|
||||
0)
|
||||
run="fasta revcom nbody binarytree fannkuch"
|
||||
run="fasta revcom nbody binarytree fannkuch regexdna"
|
||||
;;
|
||||
*)
|
||||
run=$*
|
||||
|
Loading…
Reference in New Issue
Block a user