2021-09-15 13:43:01 +00:00
|
|
|
package heffalump
|
2016-12-11 01:38:18 +00:00
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"io"
|
|
|
|
"math/rand"
|
|
|
|
"strings"
|
|
|
|
"unicode"
|
|
|
|
"unicode/utf8"
|
2022-07-26 05:46:04 +00:00
|
|
|
|
|
|
|
"git.tcp.direct/kayos/common/squish"
|
2016-12-11 01:38:18 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// ScanHTML is a split function for a bufio.Scanner that returns each
// space-separated word of text or each HTML tag, with surrounding spaces
// deleted. A tag is everything from a '<' up to and including the next '>';
// a word ends at the next space or just before the next '<'. It will never
// return an empty string. The definition of space is set by unicode.IsSpace.
//
// When data holds no complete token and atEOF is false, ScanHTML advances
// past the leading spaces only and returns a nil token to request more data.
// At EOF any remaining non-space bytes are returned as the final token, even
// if they form an unterminated tag.
func ScanHTML(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces. Afterwards r holds the first non-space rune, if
	// there is one; otherwise it holds a space (or zero for empty data).
	var r rune
	start := 0
	for width := 0; start < len(data); start += width {
		r, width = utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
	}

	if r == '<' {
		// Tag: scan until the closing bracket and return the tag whole.
		// '>' is a single byte in UTF-8, so a bytewise scan is sufficient.
		for i := start; i < len(data); i++ {
			if data[i] == '>' {
				return i + 1, data[start : i+1], nil
			}
		}
	} else {
		// Word: scan until a space or the start of a tag.
		for width, i := 0, start; i < len(data); i += width {
			var c rune
			c, width = utf8.DecodeRune(data[i:])
			if unicode.IsSpace(c) {
				// Consume the delimiting space together with the word.
				return i + width, data[start:i], nil
			}
			if c == '<' {
				// Leave the '<' unconsumed so the next call sees the tag.
				return i, data[start:i], nil
			}
		}
	}

	// If we're at EOF, we have a final, non-empty, non-terminated token.
	// Return it.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}

	// Request more data.
	return start, nil, nil
}
|
|
|
|
|
2016-12-11 04:14:03 +00:00
|
|
|
// tokenPair holds two tokens; it is the key type of MarkovMap.
type tokenPair [2]string
|
|
|
|
|
2021-09-15 08:28:09 +00:00
|
|
|
// DefaultMarkovMap is a Markov chain based on src.
|
2022-07-26 05:46:04 +00:00
|
|
|
var DefaultMarkovMap MarkovMap
|
|
|
|
|
|
|
|
func init() {
|
|
|
|
src, _ := squish.UnpackStr(srcGz)
|
|
|
|
DefaultMarkovMap = MakeMarkovMap(strings.NewReader(src))
|
|
|
|
}
|
2016-12-11 01:38:18 +00:00
|
|
|
|
2016-12-12 00:15:14 +00:00
|
|
|
// MarkovMap is a map that acts as a Markov chain generator.
|
2016-12-11 04:14:03 +00:00
|
|
|
type MarkovMap map[tokenPair][]string
|
2016-12-11 01:38:18 +00:00
|
|
|
|
2016-12-12 00:15:14 +00:00
|
|
|
// MakeMarkovMap makes an empty MakeMarkov and fills it with r.
|
2016-12-11 04:14:03 +00:00
|
|
|
func MakeMarkovMap(r io.Reader) MarkovMap {
|
|
|
|
m := MarkovMap{}
|
|
|
|
m.Fill(r)
|
|
|
|
return m
|
|
|
|
}
|
2016-12-11 01:38:18 +00:00
|
|
|
|
2016-12-12 00:15:14 +00:00
|
|
|
// Fill adds all the tokens in r to a MarkovMap
|
2016-12-11 04:14:03 +00:00
|
|
|
func (mm MarkovMap) Fill(r io.Reader) {
|
|
|
|
var w1, w2, w3 string
|
2016-12-11 01:38:18 +00:00
|
|
|
|
2016-12-11 04:14:03 +00:00
|
|
|
s := bufio.NewScanner(r)
|
2016-12-11 01:38:18 +00:00
|
|
|
s.Split(ScanHTML)
|
|
|
|
for s.Scan() {
|
2016-12-11 04:14:03 +00:00
|
|
|
w3 := s.Text()
|
|
|
|
mm.Add(w1, w2, w3)
|
|
|
|
w1, w2 = w2, w3
|
|
|
|
}
|
|
|
|
|
|
|
|
mm.Add(w1, w2, w3)
|
|
|
|
}
|
|
|
|
|
2016-12-12 00:15:14 +00:00
|
|
|
// Add adds a three token sequence to the map.
|
2016-12-11 04:14:03 +00:00
|
|
|
func (mm MarkovMap) Add(w1, w2, w3 string) {
|
|
|
|
p := tokenPair{w1, w2}
|
|
|
|
mm[p] = append(mm[p], w3)
|
|
|
|
}
|
|
|
|
|
2016-12-12 00:21:35 +00:00
|
|
|
// Get pseudo-randomly chooses a possible suffix to w1 and w2.
|
2016-12-11 04:14:03 +00:00
|
|
|
func (mm MarkovMap) Get(w1, w2 string) string {
|
|
|
|
p := tokenPair{w1, w2}
|
|
|
|
suffix, ok := mm[p]
|
|
|
|
if !ok {
|
|
|
|
return ""
|
2016-12-11 01:38:18 +00:00
|
|
|
}
|
|
|
|
|
2022-04-21 02:47:40 +00:00
|
|
|
// We don't care about cryptographically sound entropy here, ignore gosec G404.
|
|
|
|
/* #nosec */
|
2016-12-11 04:14:03 +00:00
|
|
|
r := rand.Intn(len(suffix))
|
|
|
|
return suffix[r]
|
2016-12-11 01:38:18 +00:00
|
|
|
}
|
|
|
|
|
2016-12-12 00:15:14 +00:00
|
|
|
// Read fills p with data from calling Get on the MarkovMap.
|
2016-12-11 04:14:03 +00:00
|
|
|
func (mm MarkovMap) Read(p []byte) (n int, err error) {
|
|
|
|
var w1, w2, w3 string
|
2016-12-11 01:38:18 +00:00
|
|
|
|
2016-12-11 04:14:03 +00:00
|
|
|
for {
|
|
|
|
w3 = mm.Get(w1, w2)
|
|
|
|
if n+len(w3)+1 >= len(p) {
|
2016-12-11 01:38:18 +00:00
|
|
|
break
|
|
|
|
}
|
2016-12-11 04:14:03 +00:00
|
|
|
n += copy(p[n:], w3)
|
|
|
|
n += copy(p[n:], "\n")
|
|
|
|
w1, w2 = w2, w3
|
2016-12-11 01:38:18 +00:00
|
|
|
}
|
2016-12-11 04:14:03 +00:00
|
|
|
|
|
|
|
return
|
2016-12-11 01:38:18 +00:00
|
|
|
}
|