Create & Init Project...

This commit is contained in:
2019-04-22 18:49:16 +08:00
commit fc4fa37393
25440 changed files with 4054998 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
# Bazel targets for the Go package go-common/library/text/translate/chinese.
package(default_visibility = ["//visibility:public"])
# Load the Go rules used by the targets below.
load(
"@io_bazel_rules_go//go:def.bzl",
"go_test",
"go_library",
)
# Test target for opencc_test.go; it embeds the library target declared below.
go_test(
name = "go_default_test",
srcs = ["opencc_test.go"],
embed = [":go_default_library"],
tags = ["automanaged"],
)
# Library target built from the package's non-test sources.
go_library(
name = "go_default_library",
srcs = [
"constant.go",
"dict.go",
"opencc.go",
],
importpath = "go-common/library/text/translate/chinese",
tags = ["automanaged"],
visibility = ["//visibility:public"],
deps = [
"//library/log:go_default_library",
"//vendor/github.com/go-ego/cedar:go_default_library",
],
)
# Every file under this directory; private, consumed by all-srcs below.
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
# Publicly visible group that re-exports package-srcs.
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@@ -0,0 +1,44 @@
# opencc - Golang version OpenCC
## Introduction 介紹
opencc is a golang port of OpenCC([Open Chinese Convert 開放中文轉換](https://github.com/BYVoid/OpenCC/)) which is a project for conversion between Traditional and Simplified Chinese developed by [BYVoid](https://www.byvoid.com/).
opencc is the "**Go**lang version of Open**CC**": a complete rewrite of OpenCC in Go. It only borrows the dictionary files and configuration files of OpenCC, so it may not produce the same output as the original OpenCC.
## Usage 使用
```go
package main

import (
	"context"
	"fmt"

	"go-common/library/text/translate/chinese"
)

func main() {
	chinese.Init()
	in := `请不要怀疑,这是一个由人工智能推荐的频道。`
	out := chinese.Convert(context.Background(), in)
	fmt.Printf("%s:%s\n", in, out)
}
// 请不要怀疑,这是一个由人工智能推荐的频道。
// 請不要懷疑,這是一個由人工智慧推薦的頻道。
```
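## Conversions
The OpenCC conversion configurations are listed below. Only `s2twp` is currently registered in this package; it is the conversion that `Convert` and `Converts` apply.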
* `s2t` Simplified Chinese to Traditional Chinese
* `t2s` Traditional Chinese to Simplified Chinese
* `s2tw` Simplified Chinese to Traditional Chinese (Taiwan Standard)
* `tw2s` Traditional Chinese (Taiwan Standard) to Simplified Chinese
* `s2hk` Simplified Chinese to Traditional Chinese (Hong Kong Standard)
* `hk2s` Traditional Chinese (Hong Kong Standard) to Simplified Chinese
* `s2twp` Simplified Chinese to Traditional Chinese (Taiwan Standard) with Taiwanese idiom
* `tw2sp` Traditional Chinese (Taiwan Standard) to Simplified Chinese with Mainland Chinese idiom
* `t2tw` Traditional Chinese (OpenCC Standard) to Taiwan Standard
* `t2hk` Traditional Chinese (OpenCC Standard) to Hong Kong Standard

View File

@@ -0,0 +1,286 @@
package chinese
// Conversion configurations, one JSON document per variable.
//
// Each document has a "name", a "segmentation" section and a
// "conversion_chain" array. The loader in this package ((*openCC).dict) reads
// "name" and "conversion_chain"; every chain entry is either a single "txt"
// dictionary file or a "group" of dictionary files. The "segmentation"
// section is not consulted by that loader.
var (
// hk2s: Traditional Chinese (Hong Kong standard) to Simplified Chinese.
hk2s = `{
"name": "Traditional Chinese (Hong Kong standard) to Simplified Chinese",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "txt",
"file": "TSPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "HKVariantsRevPhrases.txt"
}, {
"type": "txt",
"file": "HKVariantsRev.txt"
}]
}
}, {
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "TSPhrases.txt"
}, {
"type": "txt",
"file": "TSCharacters.txt"
}]
}
}]
}
`
// s2hk: Simplified Chinese to Traditional Chinese (Hong Kong standard).
s2hk = `{
"name": "Simplified Chinese to Traditional Chinese (Hong Kong standard)",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "txt",
"file": "STPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "STPhrases.txt"
}, {
"type": "txt",
"file": "STCharacters.txt"
}]
}
}, {
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "HKVariantsPhrases.txt"
}, {
"type": "txt",
"file": "HKVariants.txt"
}]
}
}]
}
`
// s2t: Simplified Chinese to Traditional Chinese.
s2t = `{
"name": "Simplified Chinese to Traditional Chinese",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "txt",
"file": "STPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "STPhrases.txt"
}, {
"type": "txt",
"file": "STCharacters.txt"
}]
}
}]
}
`
// s2tw: Simplified Chinese to Traditional Chinese (Taiwan standard).
s2tw = `{
"name": "Simplified Chinese to Traditional Chinese (Taiwan standard)",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "txt",
"file": "STPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "STPhrases.txt"
}, {
"type": "txt",
"file": "STCharacters.txt"
}]
}
}, {
"dict": {
"type": "txt",
"file": "TWVariants.txt"
}
}]
}
`
// s2twp: Simplified Chinese to Traditional Chinese (Taiwan standard, with
// phrases). This is the configuration named by defaultConversion.
s2twp = `{
"name": "Simplified Chinese to Traditional Chinese (Taiwan standard, with phrases)",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "txt",
"file": "STPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "STPhrases.txt"
}, {
"type": "txt",
"file": "STCharacters.txt"
}]
}
}, {
"dict": {
"type": "txt",
"file": "TWPhrases.txt"
}
}, {
"dict": {
"type": "txt",
"file": "TWVariants.txt"
}
}]
}
`
// t2hk: Traditional Chinese to Traditional Chinese (Hong Kong standard).
t2hk = `{
"name": "Traditional Chinese to Traditional Chinese (Hong Kong standard)",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "txt",
"file": "HKVariants.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "txt",
"file": "HKVariants.txt"
}
}]
}
`
// t2s: Traditional Chinese to Simplified Chinese.
t2s = `{
"name": "Traditional Chinese to Simplified Chinese",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "txt",
"file": "TSPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "TSPhrases.txt"
}, {
"type": "txt",
"file": "TSCharacters.txt"
}]
}
}]
}
`
// t2tw: Traditional Chinese to Traditional Chinese (Taiwan standard).
t2tw = `{
"name": "Traditional Chinese to Traditional Chinese (Taiwan standard)",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "txt",
"file": "TWVariants.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "txt",
"file": "TWVariants.txt"
}
}]
}
`
// tw2s: Traditional Chinese (Taiwan standard) to Simplified Chinese.
tw2s = `{
"name": "Traditional Chinese (Taiwan standard) to Simplified Chinese",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "txt",
"file": "TSPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "TWVariantsRevPhrases.txt"
}, {
"type": "txt",
"file": "TWVariantsRev.txt"
}]
}
}, {
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "TSPhrases.txt"
}, {
"type": "txt",
"file": "TSCharacters.txt"
}]
}
}]
}
`
// tw2sp: Traditional Chinese (Taiwan standard) to Simplified Chinese (with
// phrases).
tw2sp = `{
"name": "Traditional Chinese (Taiwan standard) to Simplified Chinese (with phrases)",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "txt",
"file": "TSPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "TWVariantsRevPhrases.txt"
}, {
"type": "txt",
"file": "TWVariantsRev.txt"
}]
}
}, {
"dict": {
"type": "txt",
"file": "TWPhrasesRev.txt"
}
}, {
"dict": {
"type": "group",
"dicts": [{
"type": "txt",
"file": "TSPhrases.txt"
}, {
"type": "txt",
"file": "TSCharacters.txt"
}]
}
}]
}
`
)

View File

@@ -0,0 +1,106 @@
package chinese
import (
"bytes"
"fmt"
"io"
"net/http"
"strings"
"time"
"go-common/library/log"
"github.com/go-ego/cedar"
)
// dict is one loaded dictionary: a trie of source phrases plus the
// replacement candidates for each phrase.
type dict struct {
// Trie stores every source phrase; the value stored with a phrase is its
// index into Values.
Trie *cedar.Cedar
// Values holds, per inserted phrase, the list of replacement candidates
// parsed from the dictionary line.
Values [][]string
}
// buildFromFile fetches the dictionary named fileName through raw and indexes
// it into a dict.
//
// The content is processed line by line. A line is trimmed and split on tabs;
// lines with fewer than two fields are ignored. The first field is the key
// inserted into the trie. With exactly two fields the second one is split on
// whitespace to obtain the candidates; with more than two fields every field
// after the key is a candidate.
//
// An error is returned only when a trie insertion fails. When raw yields no
// data the result is an empty dict and a nil error.
func buildFromFile(fileName string) (*dict, error) {
	trie := cedar.New()
	values := make([][]string, 0)
	content := string(raw(fileName))
	for _, line := range strings.Split(content, "\n") {
		fields := strings.Split(strings.TrimSpace(line), "\t")
		if len(fields) < 2 {
			continue
		}
		// The trie value is the position the candidates are about to take.
		if err := trie.Insert([]byte(fields[0]), len(values)); err != nil {
			return nil, err
		}
		candidates := fields[1:]
		if len(fields) == 2 {
			candidates = strings.Fields(fields[1])
		}
		values = append(values, candidates)
	}
	return &dict{Trie: trie, Values: values}, nil
}
// prefixMatch finds every dictionary key that is a prefix of str and returns
// a map from each such key to its replacement candidates. The map is empty
// when nothing matches.
//
// It returns an error when the dict has no trie, or when the trie cannot
// resolve the key or value of a matched node.
func (d *dict) prefixMatch(str string) (map[string][]string, error) {
	if d.Trie == nil {
		return nil, fmt.Errorf("Trie is nil")
	}
	matches := make(map[string][]string)
	ids := d.Trie.PrefixMatch([]byte(str), 0)
	for i := range ids {
		nodeID := ids[i]
		matchedKey, keyErr := d.Trie.Key(nodeID)
		if keyErr != nil {
			return nil, keyErr
		}
		// The stored value is the index of this key's candidates in Values.
		valueIdx, valueErr := d.Trie.Value(nodeID)
		if valueErr != nil {
			return nil, valueErr
		}
		matches[string(matchedKey)] = d.Values[valueIdx]
	}
	return matches, nil
}
var (
// defaultRead is the initial buffer capacity, in bytes, that raw hands to
// readAll when reading a downloaded dictionary.
defaultRead int64 = 16 * 1024 // 16kb
// defaultURL is the base URL that raw prepends to a dictionary file name.
defaultURL = "http://i0.hdslb.com/bfs/static/"
)
// raw downloads the dictionary file named file from defaultURL and returns
// its bytes. The download is attempted up to three times with a 10 second
// client timeout; every failed attempt is logged and followed by a 50ms
// pause. It returns nil when all attempts fail, so callers never receive a
// partially read body.
func raw(file string) (bs []byte) {
	client := http.Client{Timeout: 10 * time.Second}
	for i := 0; i < 3; i++ {
		data, err := fetchDict(&client, defaultURL+file)
		if err == nil {
			return data
		}
		log.Error("bfs client url:%s file:%s err:%+v", defaultURL, file, err)
		time.Sleep(time.Millisecond * 50)
	}
	return nil
}

// fetchDict performs a single GET of url and returns the response body. The
// body is closed before returning on every path, including non-200 responses,
// which are reported as errors carrying the status code.
func fetchDict(client *http.Client, url string) ([]byte, error) {
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status %d", resp.StatusCode)
	}
	data, err := readAll(resp.Body, defaultRead)
	if err != nil {
		return nil, err
	}
	return data, nil
}
// readAll reads r until EOF or error into a buffer whose initial capacity is
// capacity bytes, and returns the bytes read together with any read error.
//
// If the buffer panics with bytes.ErrTooLarge while growing, the panic is
// converted into a returned error; any other panic is re-raised unchanged.
func readAll(r io.Reader, capacity int64) (b []byte, err error) {
	buf := bytes.NewBuffer(make([]byte, 0, capacity))
	defer func() {
		switch recovered := recover().(type) {
		case nil:
			// Nothing panicked.
		case error:
			if recovered != bytes.ErrTooLarge {
				panic(recovered)
			}
			err = recovered
		default:
			panic(recovered)
		}
	}()
	if _, err = buf.ReadFrom(r); err != nil {
		return buf.Bytes(), err
	}
	return buf.Bytes(), nil
}

View File

@@ -0,0 +1,181 @@
package chinese
import (
"context"
"encoding/json"
"fmt"
"strings"
"go-common/library/log"
)
var (
// defaultConversion is the key into conversions used by Convert and
// Converts, and by convert when it is given an empty conversion name.
defaultConversion = "s2twp"
)
// Group is one step of a conversion chain: an ordered list of dictionaries
// together with the names of the files they were built from.
type Group struct {
// Files lists the dictionary file names, in the same order as Dicts.
Files []string
// Dicts are the loaded dictionaries; convert tries them in this order.
Dicts []*dict
}
// String renders the group as the list of its dictionary file names.
func (g *Group) String() string {
	return fmt.Sprint(g.Files)
}
// openCC holds one conversion: its configuration and the dictionary groups
// built from that configuration.
type openCC struct {
// Conversion is the raw JSON configuration text that dict parses.
Conversion string
// Description is the "name" field taken from the configuration.
Description string
// DictGroup holds the groups built from the configuration's
// "conversion_chain"; convert applies them in order.
DictGroup []*Group
}
// conversions maps a conversion name to its converter. Entries start out with
// only the JSON configuration set; Init builds their dictionaries.
// Only "s2twp" is registered; the other configurations are kept below as
// commented-out entries.
var conversions = map[string]*openCC{
"s2twp": {Conversion: s2twp},
// "hk2s": {Conversion: hk2s}, "s2hk": {Conversion: s2hk}, "s2t": {Conversion: s2t},
// "s2tw": {Conversion: s2tw}, "t2hk": {Conversion: t2hk},
// "t2s": {Conversion: t2s}, "t2tw": {Conversion: t2tw},
// "tw2s": {Conversion: tw2s}, "tw2sp": {Conversion: tw2sp},
}
// Init builds the dictionaries of every registered conversion by calling dict
// on each entry of conversions. It panics with the first error returned.
func Init() {
	for name := range conversions {
		cc := conversions[name]
		err := cc.dict(name)
		if err != nil {
			panic(err)
		}
	}
}
// Converts applies the default conversion to every string in in and returns
// a map from each input string to its converted form. When converting an
// input fails, the failure is logged and that input maps to itself.
// ctx is accepted for API symmetry and is currently unused.
func Converts(ctx context.Context, in ...string) (out map[string]string) {
	out = make(map[string]string, len(in))
	for _, v := range in {
		converted, err := convert(v, defaultConversion)
		if err != nil {
			// Log the single input that failed, not the whole argument list.
			log.Error("convert(%s),err:%+v", v, err)
			converted = v
		}
		out[v] = converted
	}
	return out
}
// Convert applies the default conversion (Simplified Chinese to Traditional
// Chinese) to in and returns the result. When the conversion fails, the
// failure is logged and in is returned unchanged, matching the fallback used
// by Converts. ctx is currently unused.
func Convert(ctx context.Context, in string) (out string) {
	converted, err := convert(in, defaultConversion)
	if err != nil {
		log.Error("convert(%s),err:%+v", in, err)
		return in
	}
	return converted
}
// dict parses the JSON configuration in cc.Conversion, stores its "name" in
// cc.Description and rebuilds cc.DictGroup with one Group per entry of
// "conversion_chain". conversion is the registered name of this converter and
// is used only in error messages.
//
// The groups are assembled in a local slice and assigned at the end, so
// calling dict again replaces the previous groups instead of appending to
// them, and a failed call leaves cc untouched. Malformed configuration is
// reported as an error rather than a panic.
func (cc *openCC) dict(conversion string) error {
	var config map[string]interface{}
	if err := json.Unmarshal([]byte(cc.Conversion), &config); err != nil {
		return fmt.Errorf("parse %s config: %v", conversion, err)
	}
	name, ok := config["name"].(string)
	if !ok {
		return fmt.Errorf("name should be string")
	}
	dictChain, ok := config["conversion_chain"].([]interface{})
	if !ok {
		return fmt.Errorf("format %+v not correct", config)
	}
	groups := make([]*Group, 0, len(dictChain))
	for _, v := range dictChain {
		step, ok := v.(map[string]interface{})
		if !ok {
			return fmt.Errorf("should be map inside conversion_chain")
		}
		dictMap, ok := step["dict"]
		if !ok {
			return fmt.Errorf("should have dict inside conversion_chain")
		}
		dictConf, ok := dictMap.(map[string]interface{})
		if !ok {
			// Skipping the step silently would drop part of the chain.
			return fmt.Errorf("dict should be map inside conversion_chain")
		}
		group, err := cc.group(dictConf)
		if err != nil {
			return err
		}
		groups = append(groups, group)
	}
	cc.Description = name
	cc.DictGroup = groups
	return nil
}
// group builds a Group from one dictionary description d taken from the
// conversion configuration.
//
// d["type"] selects the shape: "txt" loads the single file named by
// d["file"] with buildFromFile; "group" recursively builds every entry of
// d["dicts"] and flattens their files and dictionaries, in order, into one
// Group. Any other type, a missing or wrongly typed field, or a dictionary
// build failure is returned as an error.
func (cc *openCC) group(d map[string]interface{}) (*Group, error) {
	typ, ok := d["type"].(string)
	if !ok {
		return nil, fmt.Errorf("type should be string")
	}
	res := &Group{}
	switch typ {
	case "group":
		dicts, ok := d["dicts"].([]interface{})
		if !ok {
			return nil, fmt.Errorf("dicts field invalid")
		}
		for _, item := range dicts {
			sub, ok := item.(map[string]interface{})
			if !ok {
				return nil, fmt.Errorf("dicts items invalid")
			}
			group, err := cc.group(sub)
			if err != nil {
				return nil, err
			}
			res.Files = append(res.Files, group.Files...)
			res.Dicts = append(res.Dicts, group.Dicts...)
		}
	case "txt":
		rawFile, ok := d["file"]
		if !ok {
			return nil, fmt.Errorf("no file field found")
		}
		// Checked assertion: a non-string file name is a configuration
		// error, not a reason to panic.
		file, ok := rawFile.(string)
		if !ok {
			return nil, fmt.Errorf("file should be string")
		}
		daDict, err := buildFromFile(file)
		if err != nil {
			return nil, err
		}
		res.Files = append(res.Files, file)
		res.Dicts = append(res.Dicts, daDict)
	default:
		return nil, fmt.Errorf("type should be txt or group")
	}
	return res, nil
}
// convert string from Simplified Chinese to Traditional Chinese or vice versa
func convert(in, conversion string) (string, error) {
if conversion == "" {
conversion = defaultConversion
}
for _, group := range conversions[conversion].DictGroup {
r := []rune(in)
var tokens []string
for i := 0; i < len(r); {
s := r[i:]
var token string
max := 0
for _, dict := range group.Dicts {
ret, err := dict.prefixMatch(string(s))
if err != nil {
return "", err
}
if len(ret) > 0 {
o := ""
for k, v := range ret {
if len(k) > max {
max = len(k)
token = v[0]
o = k
}
}
i += len([]rune(o))
break
}
}
if max == 0 { //no match
token = string(r[i])
i++
}
tokens = append(tokens, token)
}
in = strings.Join(tokens, "")
}
return in, nil
}

View File

@@ -0,0 +1,52 @@
package chinese
import (
"context"
"testing"
)
// TestConvert initialises the package and logs the conversion of two sample
// sentences. It makes no assertions on the output.
func TestConvert(t *testing.T) {
	Init()
	samples := []string{
		`请不要怀疑,这是一个由人工智能推荐的频道。`,
		`说起来你可能不信,我是考试考进来的`,
	}
	for _, in := range samples {
		out := Convert(context.Background(), in)
		t.Logf("in:%s,out:%s", in, out)
	}
}
// BenchmarkConvert measures Convert over a rotating set of sample sentences.
// Init runs before the timer is reset so that building the dictionaries is
// not counted, and the result is logged once after the loop instead of on
// every iteration so that logging does not distort the measurement.
func BenchmarkConvert(b *testing.B) {
	var testcase = []string{
		"说起来你可能不信,我是考试考进来的",
		"说起来你可能不信,我是花钱找关系进来的",
		"请不要怀疑,这是一个由人工智能推荐的频道",
		"我开挖掘机拆屋的时候听特别带感",
		"1990年真实记录当时的秋名山的日常",
		"1990年藤原豆腐店成了连锁店 没错这些车都是送豆腐的",
		"Go语言,从底层到应用视Golang的环境搭建、基础知识、进阶知识、项目实践、Redis基础及其项目实践海量用户通讯系统、算法与数据结构基础知识的golang实现。",
	}
	Init()
	b.ResetTimer()
	var in, out string
	for i := 0; i < b.N; i++ {
		in = testcase[i%len(testcase)]
		out = Convert(context.Background(), in)
	}
	b.StopTimer()
	b.Logf("in:%s,out:%s", in, out)
}
// BenchmarkConverts measures Converts over the full set of sample sentences
// per iteration. Init runs before the timer is reset so that building the
// dictionaries is not counted; the last result is logged once at the end.
func BenchmarkConverts(b *testing.B) {
	var testcase = []string{
		"说起来你可能不信,我是考试考进来的",
		"说起来你可能不信,我是花钱找关系进来的",
		"请不要怀疑,这是一个由人工智能推荐的频道",
		"我开挖掘机拆屋的时候听特别带感",
		"1990年真实记录当时的秋名山的日常",
		"1990年藤原豆腐店成了连锁店 没错这些车都是送豆腐的",
		"Go语言,从底层到应用视Golang的环境搭建、基础知识、进阶知识、项目实践、Redis基础及其项目实践海量用户通讯系统、算法与数据结构基础知识的golang实现。",
	}
	Init()
	b.ResetTimer()
	var out map[string]string
	for i := 0; i < b.N; i++ {
		out = Converts(context.Background(), testcase...)
	}
	b.StopTimer()
	b.Logf("in:%s,out:%s", testcase, out)
}