96 lines
1.9 KiB
Go
96 lines
1.9 KiB
Go
|
package heff
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"math/rand"
|
||
|
"strings"
|
||
|
"unicode"
|
||
|
"unicode/utf8"
|
||
|
)
|
||
|
|
||
|
// ScanHTML is a basic split function for a Scanner that returns each
|
||
|
// space-separated word of text or HTML tag, with surrounding spaces deleted.
|
||
|
// It will never return an empty string. The definition of space is set by
|
||
|
// unicode.IsSpace.
|
||
|
func ScanHTML(data []byte, atEOF bool) (advance int, token []byte, err error) {
|
||
|
// Skip leading spaces.
|
||
|
var r rune
|
||
|
start := 0
|
||
|
for width := 0; start < len(data); start += width {
|
||
|
r, width = utf8.DecodeRune(data[start:])
|
||
|
if !unicode.IsSpace(r) {
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
if r == '<' {
|
||
|
// Scan until closing bracket
|
||
|
for i := start; i < len(data); i++ {
|
||
|
if data[i] == '>' {
|
||
|
return i + 1, data[start : i+1], nil
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
// Scan until space, marking end of word.
|
||
|
for width, i := 0, start; i < len(data); i += width {
|
||
|
var r rune
|
||
|
r, width = utf8.DecodeRune(data[i:])
|
||
|
if unicode.IsSpace(r) {
|
||
|
return i + width, data[start:i], nil
|
||
|
}
|
||
|
if r == '<' {
|
||
|
return i, data[start:i], nil
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
|
||
|
if atEOF && len(data) > start {
|
||
|
return len(data), data[start:], nil
|
||
|
}
|
||
|
// Request more data.
|
||
|
return start, nil, nil
|
||
|
}
|
||
|
|
||
|
const (
|
||
|
nprefix = 2
|
||
|
nonword = "\n"
|
||
|
)
|
||
|
|
||
|
type tokenPair [nprefix]string
|
||
|
|
||
|
var markov = make(map[tokenPair][]string)
|
||
|
|
||
|
func init() {
|
||
|
var w1, w2 = nonword, nonword
|
||
|
var p tokenPair
|
||
|
|
||
|
s := bufio.NewScanner(strings.NewReader(Src))
|
||
|
s.Split(ScanHTML)
|
||
|
for s.Scan() {
|
||
|
t := s.Text()
|
||
|
p = tokenPair{w1, w2}
|
||
|
markov[p] = append(markov[p], t)
|
||
|
w1, w2 = w2, t
|
||
|
}
|
||
|
|
||
|
p = tokenPair{w1, w2}
|
||
|
markov[p] = append(markov[p], nonword)
|
||
|
}
|
||
|
|
||
|
func Generate(w io.Writer, maxgen int) {
|
||
|
var w1, w2 = nonword, nonword
|
||
|
|
||
|
for i := 0; i < maxgen; i++ {
|
||
|
p := tokenPair{w1, w2}
|
||
|
suffix := markov[p]
|
||
|
r := rand.Intn(len(suffix))
|
||
|
t := suffix[r]
|
||
|
if t == nonword {
|
||
|
break
|
||
|
}
|
||
|
fmt.Fprint(w, t, "\n")
|
||
|
w1, w2 = w2, t
|
||
|
}
|
||
|
}
|