go-common/library/text/translate/chinese/dict.go
2019-04-22 18:49:16 +08:00

107 lines
2.3 KiB
Go

package chinese
import (
"bytes"
"fmt"
"io"
"net/http"
"strings"
"time"
"go-common/library/log"
"github.com/go-ego/cedar"
)
// dict pairs a cedar trie of dictionary keys with the value lists those keys
// map to. The integer stored in the trie for a key is used as an index into
// Values (buildFromFile inserts len(values) for each key, and prefixMatch
// reads d.Values[value]).
type dict struct {
// Trie indexes the dictionary keys; prefixMatch treats a nil Trie as an error.
Trie *cedar.Cedar
// Values holds one []string per inserted key, in insertion order.
Values [][]string
}
// buildFromFile builds a dict from the content that raw returns for fileName.
//
// The content is split into lines on "\n". Each line is trimmed of
// surrounding whitespace and split on tabs; lines that yield fewer than two
// fields are skipped. The first field becomes the trie key and the index of
// the entry in Values becomes the value stored in the trie. When a line has
// more than two fields, every field after the key is kept as the entry's
// values; otherwise the single remaining field is split on whitespace.
//
// The only error returned is the one reported by the trie's Insert.
func buildFromFile(fileName string) (*dict, error) {
	content := string(raw(fileName))
	built := &dict{
		Trie:   cedar.New(),
		Values: make([][]string, 0),
	}
	for _, row := range strings.Split(content, "\n") {
		cols := strings.Split(strings.TrimSpace(row), "\t")
		if len(cols) < 2 {
			continue
		}
		// The trie stores the position this entry is about to take in Values.
		if err := built.Trie.Insert([]byte(cols[0]), len(built.Values)); err != nil {
			return nil, err
		}
		var entry []string
		switch {
		case len(cols) > 2:
			entry = cols[1:]
		default:
			entry = strings.Fields(cols[1])
		}
		built.Values = append(built.Values, entry)
	}
	return built, nil
}
// prefixMatch looks up str in the trie via PrefixMatch and returns a map from
// each matched key to the value list stored for it in d.Values.
//
// It returns an error when d.Trie is nil, or when the trie fails to resolve
// the key or value of a matched node. PrefixMatch is called with 0 as its
// second argument (presumably "no limit on matches" — confirm against the
// cedar documentation).
func (d *dict) prefixMatch(str string) (map[string][]string, error) {
	if d.Trie == nil {
		return nil, fmt.Errorf("Trie is nil")
	}
	nodes := d.Trie.PrefixMatch([]byte(str), 0)
	matches := make(map[string][]string, len(nodes))
	for _, node := range nodes {
		word, keyErr := d.Trie.Key(node)
		if keyErr != nil {
			return nil, keyErr
		}
		// The trie value is the index of this key's entry in d.Values.
		idx, valErr := d.Trie.Value(node)
		if valErr != nil {
			return nil, valErr
		}
		matches[string(word)] = d.Values[idx]
	}
	return matches, nil
}
var (
// defaultRead is the initial buffer capacity, in bytes, that raw passes to
// readAll when reading a response body.
defaultRead int64 = 16 * 1024 // 16kb
// defaultURL is the base URL that raw prepends to the requested file name.
defaultURL = "http://i0.hdslb.com/bfs/static/"
)
// raw downloads defaultURL+file and returns the response body.
//
// The request is attempted up to three times using a client with a 10 second
// timeout. Every failed attempt (transport error, non-200 status, or an error
// while reading the body) is logged, and the next attempt starts after a 50ms
// pause. When all attempts fail the result is nil; a partially read body is
// never returned.
func raw(file string) (bs []byte) {
	const (
		attempts = 3
		backoff  = 50 * time.Millisecond
	)
	client := http.Client{Timeout: 10 * time.Second}
	for i := 0; i < attempts; i++ {
		if i > 0 {
			time.Sleep(backoff)
		}
		body, err := rawFetch(&client, defaultURL+file)
		if err == nil {
			return body
		}
		log.Error("bfs client url:%s file:%s err:%+v", defaultURL, file, err)
	}
	return nil
}

// rawFetch performs a single GET of url and returns the full body of a 200
// response. It lives in its own function so the deferred Close runs at the
// end of every attempt, including attempts rejected for their status code.
func rawFetch(client *http.Client, url string) ([]byte, error) {
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
	}
	body, err := readAll(resp.Body, defaultRead)
	if err != nil {
		// Drop whatever was read so callers never see a truncated file.
		return nil, err
	}
	return body, nil
}
// readAll drains r into a buffer whose initial capacity is capacity bytes and
// returns everything that was read together with any error reported by the
// buffer's ReadFrom (whatever was read before the error is still returned).
//
// If reading panics with bytes.ErrTooLarge, the panic is converted into a
// returned error and b is left nil. Any other panic is re-raised unchanged.
func readAll(r io.Reader, capacity int64) (b []byte, err error) {
	sink := bytes.NewBuffer(make([]byte, 0, capacity))
	defer func() {
		cause := recover()
		if cause == nil {
			return
		}
		asErr, isErr := cause.(error)
		if !isErr || asErr != bytes.ErrTooLarge {
			// Not the overflow signalled by bytes.Buffer: let it propagate.
			panic(cause)
		}
		err = asErr
	}()
	if _, err = sink.ReadFrom(r); err != nil {
		return sink.Bytes(), err
	}
	return sink.Bytes(), nil
}