go/test/bench/go1/fasta_test.go

// Copyright 2011 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package go1

import "runtime"

// Not a benchmark; input for revcomp.

// fastabytes holds the FASTA data returned by makefasta. It is a
// package-level variable, so it is generated once during package
// initialization.
var fastabytes = makefasta()

// makefasta builds the package-level FASTA input by calling fasta with a
// size that depends on the target architecture: 25e6 everywhere except
// on arm, where 25e4 is used.
func makefasta() []byte {
	const (
		defaultSize = 25e6
		armSize     = 25e4
	)
	if runtime.GOARCH != "arm" {
		return fasta(defaultSize)
	}
	// TODO(dfc) remove this limitation after precise gc.
	// A value of 25e6 consumes 465mb of heap on 32bit
	// platforms, which is too much for most ARM systems.
	// A value of 25e5 produces a memory layout that
	// confuses the gc on 32bit platforms. So 25e4 it is.
	return fasta(armSize)
}

func fasta(n int) []byte {
	out := make(fastaBuffer, 0, 11*n)

	iub := []fastaAcid{
		{prob: 0.27, sym: 'a'},
		{prob: 0.12, sym: 'c'},
		{prob: 0.12, sym: 'g'},
		{prob: 0.27, sym: 't'},
		{prob: 0.02, sym: 'B'},
		{prob: 0.02, sym: 'D'},
		{prob: 0.02, sym: 'H'},
		{prob: 0.02, sym: 'K'},
		{prob: 0.02, sym: 'M'},
		{prob: 0.02, sym: 'N'},
		{prob: 0.02, sym: 'R'},
		{prob: 0.02, sym: 'S'},
		{prob: 0.02, sym: 'V'},
		{prob: 0.02, sym: 'W'},
		{prob: 0.02, sym: 'Y'},
	}

	homosapiens := []fastaAcid{
		{prob: 0.3029549426680, sym: 'a'},
		{prob: 0.1979883004921, sym: 'c'},
		{prob: 0.1975473066391, sym: 'g'},
		{prob: 0.3015094502008, sym: 't'},
	}

	alu := []byte(
		"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG" +
			"GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA" +
			"CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT" +
			"ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA" +
			"GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG" +
			"AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC" +
			"AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA")

	out.WriteString(">ONE Homo sapiens alu\n")
	fastaRepeat(&out, alu, 2*n)
	out.WriteString(">TWO IUB ambiguity codes\n")
	fastaRandom(&out, iub, 3*n)
	out.WriteString(">THREE Homo sapiens frequency\n")
	fastaRandom(&out, homosapiens, 5*n)
	return out
}

// fastaBuffer is a byte buffer that is filled in place: writes extend the
// slice's length within its existing capacity and never reallocate.
type fastaBuffer []byte

// Flush always panics. NextWrite calls it when a write does not fit in
// the remaining capacity, so running out of space is a fatal error.
func (b *fastaBuffer) Flush() {
	panic("flush")
}

// WriteString appends the bytes of s to the buffer.
func (b *fastaBuffer) WriteString(s string) {
	copy(b.NextWrite(len(s)), s)
}

// NextWrite extends the buffer by n bytes and returns the newly added
// region so the caller can fill it in directly. If fewer than n bytes of
// capacity remain, Flush is invoked first.
func (b *fastaBuffer) NextWrite(n int) []byte {
	cur := *b
	if len(cur)+n > cap(cur) {
		b.Flush()
		cur = *b
	}
	start, end := len(cur), len(cur)+n
	chunk := cur[start:end]
	*b = cur[:end]
	return chunk
}

// fastaLine is the maximum number of sequence bytes that fastaRepeat and
// fastaRandom place on one output line, not counting the trailing newline.
const fastaLine = 60

// fastaRepeat writes n bytes of the sequence alu, repeated cyclically, to
// out. The bytes are split into lines of at most fastaLine bytes, each
// terminated by '\n'.
//
// alu may be shorter than fastaLine. If alu is empty there is nothing to
// repeat and out is left untouched; the same holds when n <= 0. Neither
// alu nor the spare capacity of its backing array is modified.
func fastaRepeat(out *fastaBuffer, alu []byte, n int) {
	if n <= 0 || len(alu) == 0 {
		return
	}

	// Build a private buffer holding enough whole copies of alu that a
	// full line can be read contiguously from any offset in [0, len(alu)).
	// Allocating it explicitly (rather than appending to alu) keeps the
	// caller's backing array untouched.
	need := len(alu) + fastaLine
	buf := make([]byte, 0, need+len(alu))
	for len(buf) < need {
		buf = append(buf, alu...)
	}

	off := 0
	for n > 0 {
		m := n
		if m > fastaLine {
			m = fastaLine
		}
		line := out.NextWrite(m + 1)
		copy(line, buf[off:off+m])
		line[m] = '\n'
		// A line may span several copies of a short alu, so wrap with
		// modulo instead of a single subtraction.
		off = (off + m) % len(alu)
		n -= m
	}
}

// fastaLookupSize is the number of entries in the table built by
// fastaComputeLookup. fastaLookupScale (fastaLookupSize - 1, as a float64)
// is the factor by which cumulative probabilities are multiplied so that
// they span the index range of that table.
const (
	fastaLookupSize          = 4096
	fastaLookupScale float64 = fastaLookupSize - 1
)

// fastaRand is the state of the pseudo-random generator used by
// fastaRandom. It starts at 42 and is updated by every fastaRandom call,
// so successive calls continue the same sequence.
var fastaRand uint32 = 42

// fastaAcid is one entry of a symbol frequency table.
//
// sym is the byte written to the output and prob is the symbol's
// probability as given by the caller. cprob and next are filled in by
// fastaComputeLookup: cprob is the running sum of prob up to and including
// this entry, multiplied by fastaLookupScale, and next points to the
// following entry of the table (it stays nil for the last entry).
type fastaAcid struct {
	sym   byte
	prob  float64
	cprob float64
	next  *fastaAcid
}

// fastaComputeLookup prepares acid for sampling and returns a lookup table
// over it.
//
// It fills in cprob (the scaled cumulative probability) and next (a link
// to the following entry) for every element of acid, forcing the last
// element's cprob to exactly fastaLookupScale. Entry i of the returned
// table points at the first element whose cprob is not below i. The table
// holds pointers into acid, so acid must stay alive and unmodified while
// the table is in use. acid must not be empty.
func fastaComputeLookup(acid []fastaAcid) *[fastaLookupSize]*fastaAcid {
	table := new([fastaLookupSize]*fastaAcid)

	// Accumulate probabilities and chain each entry to its successor.
	cumulative := 0.0
	for i := range acid {
		cur := &acid[i]
		cumulative += cur.prob
		cur.cprob = cumulative * fastaLookupScale
		if i != 0 {
			acid[i-1].next = cur
		}
	}
	// Pin the final entry to the top of the range so that rounding in the
	// sum above can never leave a gap at the end of the table.
	last := &acid[len(acid)-1]
	last.cprob = 1.0 * fastaLookupScale

	// Both the table index and cprob are non-decreasing, so one forward
	// scan over acid fills every slot.
	idx := 0
	for slot := 0; slot < fastaLookupSize; slot++ {
		limit := float64(slot)
		for acid[idx].cprob < limit {
			idx++
		}
		table[slot] = &acid[idx]
	}

	return table
}

// fastaRandom writes n pseudo-randomly chosen symbols from acid to out, in
// lines of at most fastaLine symbols, each terminated by '\n'.
//
// Symbols are selected with the table from fastaComputeLookup (which
// updates the cprob and next fields of acid) and a linear congruential
// generator whose state lives in the package variable fastaRand; the
// state is saved back after every line, so later calls continue the same
// sequence.
func fastaRandom(out *fastaBuffer, acid []fastaAcid, n int) {
	const (
		modulus    = 139968
		multiplier = 3877
		increment  = 29573
	)
	table := fastaComputeLookup(acid)
	// scale maps a generator value in [0, modulus) onto the range of the
	// scaled cumulative probabilities.
	scale := fastaLookupScale / modulus

	for remaining := n; remaining > 0; {
		width := fastaLine
		if remaining < width {
			width = remaining
		}
		line := out.NextWrite(width + 1)

		seed := fastaRand
		for i := range line[:width] {
			seed = (seed*multiplier + increment) % modulus
			r := float64(int(seed)) * scale
			// The table gives a starting entry for the integer part of
			// r; walk forward until the entry's range covers r itself.
			pick := table[int(r)]
			for pick.cprob < r {
				pick = pick.next
			}
			line[i] = pick.sym
		}
		fastaRand = seed

		line[width] = '\n'
		remaining -= width
	}
}