341 lines
8.0 KiB
Go
341 lines
8.0 KiB
Go
package service
|
||
|
||
import (
|
||
"bufio"
|
||
"bytes"
|
||
"context"
|
||
"fmt"
|
||
"io"
|
||
"io/ioutil"
|
||
"os"
|
||
"os/exec"
|
||
"path/filepath"
|
||
"strconv"
|
||
"strings"
|
||
"sync/atomic"
|
||
"time"
|
||
|
||
"github.com/siddontang/go/ioutil2"
|
||
|
||
"go-common/app/service/live/recommend/recconst"
|
||
|
||
"go-common/app/job/live/recommend-job/internal/conf"
|
||
"go-common/library/cache/redis"
|
||
"go-common/library/log"
|
||
"go-common/library/sync/errgroup"
|
||
|
||
"github.com/pkg/errors"
|
||
)
|
||
|
||
// ItemCFJob writes Hadoop-produced item-CF recommendation data into Redis.
type ItemCFJob struct {
	Conf       *conf.JobConfig    // job settings (input file, worker count, ...)
	RedisConf  *redis.Config      // destination Redis configuration
	HadoopConf *conf.HadoopConfig // Hadoop distribution / HDFS access settings
}
|
||
|
||
// Run ...
|
||
func (j *ItemCFJob) Run() {
|
||
log.Info("Run ItemCFJob")
|
||
processFile(j.Conf, j.HadoopConf, j.RedisConf, writeItemCFToRedis)
|
||
log.Info("ItemCFJob Done")
|
||
}
|
||
|
||
func fetchFiles(jobConf *conf.JobConfig,
|
||
hadoopConf *conf.HadoopConfig) (paths []string, err error) {
|
||
path := jobConf.InputFile
|
||
if path == "" {
|
||
var javaHome string
|
||
var hadoopHome string
|
||
hadoopHome, javaHome, err = downloadHadoop(hadoopConf)
|
||
if err != nil {
|
||
log.Info("download hadoop err %+v ", err)
|
||
return
|
||
}
|
||
log.Info("download hadoop success: " + hadoopHome)
|
||
path, err = downloadFileFromHadoop(jobConf, hadoopHome, javaHome)
|
||
if err != nil {
|
||
log.Info("path %s downloaded err %+v ", path, err)
|
||
return
|
||
}
|
||
log.Info("path downloaded success: " + path)
|
||
|
||
var file os.FileInfo
|
||
file, err = os.Stat(path)
|
||
if err != nil {
|
||
log.Error("cannot open file %s err: %+v", file, err)
|
||
return
|
||
}
|
||
if file.IsDir() {
|
||
var files []os.FileInfo
|
||
files, err = ioutil.ReadDir(path)
|
||
if err != nil {
|
||
log.Error("no file in dir: %d, err: %+v", path, err)
|
||
return
|
||
}
|
||
|
||
for _, f := range files {
|
||
if strings.Index(f.Name(), ".") != 0 && strings.Index(f.Name(), "_") != 0 {
|
||
paths = append(paths, path+"/"+f.Name())
|
||
}
|
||
}
|
||
} else {
|
||
paths = []string{path}
|
||
}
|
||
} else {
|
||
if strings.Index(path, "http://") == 0 {
|
||
var httpPath = path
|
||
path, err = downloadFileFromHttp(httpPath, "/tmp/job-downloaded.txt")
|
||
if err != nil {
|
||
log.Error("download from http path=%s, error=%+v", httpPath, err)
|
||
return
|
||
}
|
||
log.Info("file downloaded from http %s to %s", httpPath, path)
|
||
} else {
|
||
var file os.FileInfo
|
||
file, err = os.Stat(path)
|
||
if err != nil {
|
||
log.Error("cannot open file %s err: %+v", file, err)
|
||
return
|
||
}
|
||
if file.IsDir() {
|
||
var files []os.FileInfo
|
||
files, err = ioutil.ReadDir(path)
|
||
if err != nil {
|
||
log.Error("no file in dir: %d, err: %+v", path, err)
|
||
return
|
||
}
|
||
|
||
for _, f := range files {
|
||
if strings.Index(f.Name(), ".") != 0 && strings.Index(f.Name(), "_") != 0 {
|
||
paths = append(paths, path+"/"+f.Name())
|
||
}
|
||
}
|
||
} else {
|
||
paths = []string{path}
|
||
}
|
||
}
|
||
}
|
||
return
|
||
}
|
||
|
||
// processFile resolves the job input into local files, then reads each
// file line by line and dispatches every line to handler on a pool of
// workerNum goroutines sharing one Redis pool.
//
// Progress is checkpointed to /tmp/<parent-dir>-<file>.offset every
// 100000 lines, so an interrupted run resumes from the last checkpoint
// instead of reprocessing the whole file.
func processFile(jobConf *conf.JobConfig,
	hadoopConf *conf.HadoopConfig,
	redisConf *redis.Config,
	handler func(line string, pool *redis.Pool) error,
) {
	paths, err := fetchFiles(jobConf, hadoopConf)
	if err != nil {
		// fetchFiles already logged the failure; nothing to process.
		return
	}
	var workerNum = jobConf.WorkerNum
	var r = redis.NewPool(redisConf)
	defer r.Close()

	log.Info("all of files %+v", paths)
	for _, path := range paths {

		var startLineNum int

		file, err := os.Open(path)
		if err != nil {
			log.Error("open path %s err %+v", path, errors.WithStack(err))
			panic(err)
		}

		// Offset file name is keyed by parent-dir + base name so
		// different inputs don't share a checkpoint.
		name := filepath.Base(filepath.Dir(path)) + "-" + filepath.Base(path)
		offsetPath := "/tmp/" + name + ".offset"
		// Restore the line number saved by a previous run; any read or
		// parse failure simply restarts from line 1.
		contentB, e := ioutil.ReadFile(offsetPath)
		if e == nil {
			content := string(contentB)
			offset, e := strconv.Atoi(content)
			if e == nil {
				startLineNum = offset
			} else {
				startLineNum = 1
			}
		} else {
			startLineNum = 1
		}
		log.Info("start from line: %d, file : %s offset file: %s", startLineNum, path, offsetPath)

		var ops uint64 // lines handled so far; incremented atomically by workers
		totalCount := lineCounter(path)
		scanner := bufio.NewScanner(file)

		g := errgroup.Group{}
		g.GOMAXPROCS(workerNum)

		var lineNum = 0
		for scanner.Scan() {
			lineNum++
			// Skip lines already processed before the last checkpoint.
			if lineNum < startLineNum {
				continue
			}

			line := scanner.Text() // e.g. "10001 [1:0.9,2:0.9]"
			g.Go(func() error {
				// NOTE(review): handler's error is discarded, so a line
				// that fails is silently skipped — confirm this is intended.
				handler(line, r)
				atomic.AddUint64(&ops, 1)
				// Progress reporting only; ops is read non-atomically
				// below, which is harmless for a log line.
				if ops%20000 == 0 {
					fmt.Printf("progress %d / %d percent %f %s \r", ops+uint64(startLineNum)-1, totalCount,
						float32(uint64(startLineNum)+ops-1)/float32(totalCount)*100, "%")
				} else if ops == uint64(totalCount) {
					fmt.Printf("progress %d / %d 100%%\n", ops, totalCount)
				}
				return nil
			})
			// Every 100000 lines: drain the workers, persist the
			// checkpoint, and start a fresh group (an errgroup must not
			// be reused after Wait).
			if lineNum%100000 == 0 {
				g.Wait()
				ioutil.WriteFile(offsetPath,
					[]byte(strconv.FormatInt(int64(lineNum), 10)),
					os.ModePerm)
				g = errgroup.Group{}
				g.GOMAXPROCS(workerNum)
			}
		}

		g.Wait()

		if err = scanner.Err(); err != nil {
			log.Error("err %+v", errors.WithStack(err))
			panic(err)
		}
		file.Close()
	}

}
|
||
|
||
func writeItemCFToRedis(line string, r *redis.Pool) (err error) {
|
||
start := strings.Index(line, "[")
|
||
end := strings.LastIndex(line, "]")
|
||
userIdStr := line[0 : start-1]
|
||
items := strings.Split(line[start+1:end], ",")
|
||
c := r.Get(context.Background())
|
||
defer c.Close()
|
||
userId, _ := strconv.Atoi(userIdStr)
|
||
key := fmt.Sprintf(recconst.UserItemCFRecKey, userId)
|
||
// 最多保留50个推荐
|
||
var inMemoryCount int
|
||
inMemoryCount, err = redis.Int(c.Do("ZCARD", key))
|
||
if err != nil {
|
||
log.Error("zcard err: %+v", err)
|
||
} else {
|
||
var toBeRemovedCount = inMemoryCount + len(items) - 60
|
||
if toBeRemovedCount > 0 {
|
||
var removed int
|
||
removed, err = redis.Int(c.Do("ZREMRANGEBYRANK", key, 0, toBeRemovedCount-1))
|
||
if err != nil {
|
||
log.Error("ZREMRANGEBYRANK key:%s, err: +%v", key, err)
|
||
} else {
|
||
log.Info("zset removed %d count, key:%s", removed, key)
|
||
}
|
||
}
|
||
}
|
||
for _, item := range items {
|
||
split := strings.Split(item, ":")
|
||
itemID := split[0]
|
||
score := split[1]
|
||
|
||
c.Send("ZADD", key, score, itemID)
|
||
}
|
||
c.Send("EXPIRE", key, 86400*30)
|
||
err = c.Flush()
|
||
|
||
if err != nil {
|
||
log.Error("zadd to redis error: %+v , key=%s", err, key)
|
||
return err
|
||
}
|
||
for i := 0; i < len(items)+1; i++ {
|
||
_, err = c.Receive()
|
||
if err != nil {
|
||
log.Error("zadd to redis error: %+v , key=%s, line=%s", err, key, line)
|
||
return err
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func lineCounter(path string) int {
|
||
buf := make([]byte, 32*1024)
|
||
r, _ := os.Open(path)
|
||
defer r.Close()
|
||
count := 0
|
||
lineSep := []byte{'\n'}
|
||
|
||
for {
|
||
c, err := r.Read(buf)
|
||
count += bytes.Count(buf[:c], lineSep)
|
||
|
||
switch {
|
||
case err == io.EOF:
|
||
return count
|
||
|
||
case err != nil:
|
||
return count
|
||
}
|
||
}
|
||
}
|
||
|
||
func downloadHadoop(hadoopConf *conf.HadoopConfig) (hadoopHome string, javaHome string, err error) {
|
||
if strings.LastIndex(hadoopConf.HadoopDir, "/") == len(hadoopConf.HadoopDir)-1 {
|
||
hadoopHome = hadoopConf.HadoopDir + "hadoop-2.8.4"
|
||
} else {
|
||
hadoopHome = hadoopConf.HadoopDir + "/hadoop-2.8.4"
|
||
}
|
||
javaHome = hadoopHome + "/jdk1.8.0_60"
|
||
|
||
if ioutil2.FileExists(hadoopHome) {
|
||
return
|
||
}
|
||
var cmd = "curl -sSLf " + hadoopConf.TarUrl + " -o /tmp/hadoop.tar.gz"
|
||
err = runCmd(cmd)
|
||
if err != nil {
|
||
return
|
||
}
|
||
cmd = "tar -C " + hadoopConf.HadoopDir + " -xf /tmp/hadoop.tar.gz"
|
||
err = runCmd(cmd)
|
||
if err != nil {
|
||
return
|
||
}
|
||
|
||
return
|
||
}
|
||
|
||
func downloadFileFromHttp(url string, output string) (string, error) {
|
||
var localPath = output
|
||
var cmd = "curl -sSLf " + url + " -o " + localPath
|
||
var err = runCmd(cmd)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
return localPath, nil
|
||
}
|
||
|
||
func downloadFileFromHadoop(jobConf *conf.JobConfig, hadoopHome string, javaHome string) (string, error) {
|
||
t := time.Now().AddDate(0, 0, -1)
|
||
day := t.Format("20060102")
|
||
localPath := fmt.Sprintf(jobConf.LocalTmpFile, day)
|
||
if ioutil2.FileExists(localPath) {
|
||
return localPath, nil
|
||
}
|
||
remotePath := fmt.Sprintf(jobConf.HadoopFile, day)
|
||
cmd := fmt.Sprintf("export JAVA_HOME=%s; %s/bin/hdfs dfs -get %s %s",
|
||
javaHome, hadoopHome, remotePath, localPath)
|
||
err := runCmd(cmd)
|
||
return localPath, err
|
||
}
|
||
|
||
// runCmd runs the cmd & print output (both stdout & stderr)
|
||
func runCmd(cmd string) (err error) {
|
||
fmt.Printf("CMD: %s \n", cmd)
|
||
out, err := exec.Command("/bin/bash", "-c", cmd).CombinedOutput()
|
||
log.Info(string(out))
|
||
if err != nil {
|
||
err = errors.Wrap(err, string(out))
|
||
}
|
||
return
|
||
}
|