97d145a9d3
When a related runningTask doesn't exist and the executor task status is running just report it as failed ignoring if it's marked to stop.
1476 lines
36 KiB
Go
1476 lines
36 KiB
Go
// Copyright 2019 Sorint.lab
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
package executor
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"net"
|
|
"net/http"
|
|
"net/url"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"agola.io/agola/internal/common"
|
|
slog "agola.io/agola/internal/log"
|
|
"agola.io/agola/internal/services/config"
|
|
"agola.io/agola/internal/services/executor/driver"
|
|
"agola.io/agola/internal/services/executor/registry"
|
|
"agola.io/agola/internal/util"
|
|
rsclient "agola.io/agola/services/runservice/client"
|
|
"agola.io/agola/services/runservice/types"
|
|
uuid "github.com/satori/go.uuid"
|
|
|
|
"github.com/gorilla/mux"
|
|
sockaddr "github.com/hashicorp/go-sockaddr"
|
|
"go.uber.org/zap"
|
|
"go.uber.org/zap/zapcore"
|
|
errors "golang.org/x/xerrors"
|
|
)
|
|
|
|
var level = zap.NewAtomicLevelAt(zapcore.InfoLevel)
|
|
var logger = slog.New(level)
|
|
var log = logger.Sugar()
|
|
|
|
const (
|
|
defaultShell = "/bin/sh -e"
|
|
|
|
toolboxContainerDir = "/mnt/agola"
|
|
)
|
|
|
|
var (
|
|
toolboxContainerPath = filepath.Join(toolboxContainerDir, "/agola-toolbox")
|
|
)
|
|
|
|
func (e *Executor) getAllPods(ctx context.Context, all bool) ([]driver.Pod, error) {
|
|
return e.driver.GetPods(ctx, all)
|
|
}
|
|
|
|
func stepUser(t *types.ExecutorTask) string {
|
|
// use the container specified user and override with task user if defined
|
|
user := t.Spec.Containers[0].User
|
|
if t.Spec.User != "" {
|
|
user = t.Spec.User
|
|
}
|
|
|
|
return user
|
|
}
|
|
|
|
func (e *Executor) createFile(ctx context.Context, pod driver.Pod, command, user string, outf io.Writer) (string, error) {
|
|
cmd := []string{toolboxContainerPath, "createfile"}
|
|
|
|
var buf bytes.Buffer
|
|
execConfig := &driver.ExecConfig{
|
|
Cmd: cmd,
|
|
User: user,
|
|
AttachStdin: true,
|
|
Stdout: &buf,
|
|
Stderr: outf,
|
|
}
|
|
|
|
ce, err := pod.Exec(ctx, execConfig)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
stdin := ce.Stdin()
|
|
go func() {
|
|
_, _ = io.WriteString(stdin, command+"\n")
|
|
stdin.Close()
|
|
}()
|
|
|
|
exitCode, err := ce.Wait(ctx)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if exitCode != 0 {
|
|
return "", errors.Errorf("toolbox exited with code: %d", exitCode)
|
|
}
|
|
|
|
return buf.String(), nil
|
|
}
|
|
|
|
func (e *Executor) doRunStep(ctx context.Context, s *types.RunStep, t *types.ExecutorTask, pod driver.Pod, logPath string) (int, error) {
|
|
if err := os.MkdirAll(filepath.Dir(logPath), 0770); err != nil {
|
|
return -1, err
|
|
}
|
|
outf, err := os.Create(logPath)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
defer outf.Close()
|
|
|
|
// TODO(sgotti) this line is used only for old runconfig versions that don't
|
|
// set a task default shell in the runconfig
|
|
shell := defaultShell
|
|
if t.Spec.Shell != "" {
|
|
shell = t.Spec.Shell
|
|
}
|
|
if s.Shell != "" {
|
|
shell = s.Shell
|
|
}
|
|
|
|
var cmd []string
|
|
if s.Command != "" {
|
|
filename, err := e.createFile(ctx, pod, s.Command, stepUser(t), outf)
|
|
if err != nil {
|
|
return -1, errors.Errorf("create file err: %v", err)
|
|
}
|
|
|
|
args := strings.Split(shell, " ")
|
|
cmd = append(args, filename)
|
|
} else {
|
|
cmd = strings.Split(shell, " ")
|
|
}
|
|
|
|
// override task working dir with runstep working dir if provided
|
|
workingDir := t.Spec.WorkingDir
|
|
if s.WorkingDir != "" {
|
|
workingDir = s.WorkingDir
|
|
}
|
|
|
|
// generate the environment using the task environment and then overriding with the runstep environment
|
|
environment := map[string]string{}
|
|
for envName, envValue := range t.Spec.Environment {
|
|
environment[envName] = envValue
|
|
}
|
|
for envName, envValue := range s.Environment {
|
|
environment[envName] = envValue
|
|
}
|
|
|
|
workingDir, err = e.expandDir(ctx, t, pod, outf, workingDir)
|
|
if err != nil {
|
|
_, _ = outf.WriteString(fmt.Sprintf("failed to expand working dir %q. Error: %s\n", workingDir, err))
|
|
return -1, err
|
|
}
|
|
|
|
execConfig := &driver.ExecConfig{
|
|
Cmd: cmd,
|
|
Env: environment,
|
|
WorkingDir: workingDir,
|
|
User: stepUser(t),
|
|
AttachStdin: true,
|
|
Stdout: outf,
|
|
Stderr: outf,
|
|
Tty: *s.Tty,
|
|
}
|
|
|
|
ce, err := pod.Exec(ctx, execConfig)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
exitCode, err := ce.Wait(ctx)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
return exitCode, nil
|
|
}
|
|
|
|
func (e *Executor) doSaveToWorkspaceStep(ctx context.Context, s *types.SaveToWorkspaceStep, t *types.ExecutorTask, pod driver.Pod, logPath string, archivePath string) (int, error) {
|
|
cmd := []string{toolboxContainerPath, "archive"}
|
|
|
|
if err := os.MkdirAll(filepath.Dir(logPath), 0770); err != nil {
|
|
return -1, err
|
|
}
|
|
logf, err := os.Create(logPath)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
defer logf.Close()
|
|
|
|
if err := os.MkdirAll(filepath.Dir(archivePath), 0770); err != nil {
|
|
return -1, err
|
|
}
|
|
archivef, err := os.Create(archivePath)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
defer archivef.Close()
|
|
|
|
workingDir, err := e.expandDir(ctx, t, pod, logf, t.Spec.WorkingDir)
|
|
if err != nil {
|
|
_, _ = logf.WriteString(fmt.Sprintf("failed to expand working dir %q. Error: %s\n", t.Spec.WorkingDir, err))
|
|
return -1, err
|
|
}
|
|
|
|
execConfig := &driver.ExecConfig{
|
|
Cmd: cmd,
|
|
Env: t.Spec.Environment,
|
|
WorkingDir: workingDir,
|
|
User: stepUser(t),
|
|
AttachStdin: true,
|
|
Stdout: archivef,
|
|
Stderr: logf,
|
|
}
|
|
|
|
ce, err := pod.Exec(ctx, execConfig)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
type ArchiveInfo struct {
|
|
SourceDir string
|
|
DestDir string
|
|
Paths []string
|
|
}
|
|
type Archive struct {
|
|
ArchiveInfos []*ArchiveInfo
|
|
OutFile string
|
|
}
|
|
|
|
a := &Archive{
|
|
OutFile: "", // use stdout
|
|
ArchiveInfos: make([]*ArchiveInfo, len(s.Contents)),
|
|
}
|
|
|
|
for i, c := range s.Contents {
|
|
a.ArchiveInfos[i] = &ArchiveInfo{
|
|
SourceDir: c.SourceDir,
|
|
DestDir: c.DestDir,
|
|
Paths: c.Paths,
|
|
}
|
|
|
|
}
|
|
|
|
stdin := ce.Stdin()
|
|
enc := json.NewEncoder(stdin)
|
|
|
|
go func() {
|
|
_ = enc.Encode(a)
|
|
stdin.Close()
|
|
}()
|
|
|
|
exitCode, err := ce.Wait(ctx)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
return exitCode, nil
|
|
}
|
|
|
|
func (e *Executor) expandDir(ctx context.Context, t *types.ExecutorTask, pod driver.Pod, logf io.Writer, dir string) (string, error) {
|
|
args := []string{dir}
|
|
cmd := append([]string{toolboxContainerPath, "expanddir"}, args...)
|
|
|
|
// limit the template answer to max 1MiB
|
|
stdout := &bytes.Buffer{}
|
|
|
|
execConfig := &driver.ExecConfig{
|
|
Cmd: cmd,
|
|
Env: t.Spec.Environment,
|
|
User: stepUser(t),
|
|
AttachStdin: true,
|
|
Stdout: stdout,
|
|
Stderr: logf,
|
|
}
|
|
|
|
ce, err := pod.Exec(ctx, execConfig)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
exitCode, err := ce.Wait(ctx)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if exitCode != 0 {
|
|
return "", errors.Errorf("expanddir ended with exit code %d", exitCode)
|
|
}
|
|
|
|
return stdout.String(), nil
|
|
}
|
|
|
|
func (e *Executor) mkdir(ctx context.Context, t *types.ExecutorTask, pod driver.Pod, logf io.Writer, dir string) error {
|
|
args := []string{dir}
|
|
cmd := append([]string{toolboxContainerPath, "mkdir"}, args...)
|
|
|
|
execConfig := &driver.ExecConfig{
|
|
Cmd: cmd,
|
|
Env: t.Spec.Environment,
|
|
User: stepUser(t),
|
|
AttachStdin: true,
|
|
Stdout: logf,
|
|
Stderr: logf,
|
|
}
|
|
|
|
ce, err := pod.Exec(ctx, execConfig)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
exitCode, err := ce.Wait(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if exitCode != 0 {
|
|
return errors.Errorf("mkdir ended with exit code %d", exitCode)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (e *Executor) template(ctx context.Context, t *types.ExecutorTask, pod driver.Pod, logf io.Writer, key string) (string, error) {
|
|
cmd := []string{toolboxContainerPath, "template"}
|
|
|
|
// limit the template answer to max 1MiB
|
|
stdout := util.NewLimitedBuffer(1024 * 1024)
|
|
|
|
workingDir, err := e.expandDir(ctx, t, pod, logf, t.Spec.WorkingDir)
|
|
if err != nil {
|
|
_, _ = io.WriteString(logf, fmt.Sprintf("failed to expand working dir %q. Error: %s\n", t.Spec.WorkingDir, err))
|
|
return "", err
|
|
}
|
|
|
|
execConfig := &driver.ExecConfig{
|
|
Cmd: cmd,
|
|
Env: t.Spec.Environment,
|
|
WorkingDir: workingDir,
|
|
User: stepUser(t),
|
|
AttachStdin: true,
|
|
Stdout: stdout,
|
|
Stderr: logf,
|
|
}
|
|
|
|
ce, err := pod.Exec(ctx, execConfig)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
stdin := ce.Stdin()
|
|
go func() {
|
|
_, _ = io.WriteString(stdin, key)
|
|
stdin.Close()
|
|
}()
|
|
|
|
exitCode, err := ce.Wait(ctx)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if exitCode != 0 {
|
|
return "", errors.Errorf("template ended with exit code %d", exitCode)
|
|
}
|
|
|
|
return stdout.String(), nil
|
|
}
|
|
|
|
func (e *Executor) unarchive(ctx context.Context, t *types.ExecutorTask, source io.Reader, pod driver.Pod, logf io.Writer, destDir string, overwrite, removeDestDir bool) error {
|
|
args := []string{"--destdir", destDir}
|
|
if overwrite {
|
|
args = append(args, "--overwrite")
|
|
}
|
|
if removeDestDir {
|
|
args = append(args, "--remove-destdir")
|
|
}
|
|
cmd := append([]string{toolboxContainerPath, "unarchive"}, args...)
|
|
|
|
workingDir, err := e.expandDir(ctx, t, pod, logf, t.Spec.WorkingDir)
|
|
if err != nil {
|
|
_, _ = io.WriteString(logf, fmt.Sprintf("failed to expand working dir %q. Error: %s\n", t.Spec.WorkingDir, err))
|
|
return err
|
|
}
|
|
|
|
execConfig := &driver.ExecConfig{
|
|
Cmd: cmd,
|
|
Env: t.Spec.Environment,
|
|
WorkingDir: workingDir,
|
|
User: stepUser(t),
|
|
AttachStdin: true,
|
|
Stdout: logf,
|
|
Stderr: logf,
|
|
}
|
|
|
|
ce, err := pod.Exec(ctx, execConfig)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
stdin := ce.Stdin()
|
|
go func() {
|
|
_, _ = io.Copy(stdin, source)
|
|
stdin.Close()
|
|
}()
|
|
|
|
exitCode, err := ce.Wait(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if exitCode != 0 {
|
|
return errors.Errorf("unarchive ended with exit code %d", exitCode)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (e *Executor) doRestoreWorkspaceStep(ctx context.Context, s *types.RestoreWorkspaceStep, t *types.ExecutorTask, pod driver.Pod, logPath string) (int, error) {
|
|
if err := os.MkdirAll(filepath.Dir(logPath), 0770); err != nil {
|
|
return -1, err
|
|
}
|
|
logf, err := os.Create(logPath)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
defer logf.Close()
|
|
|
|
for _, op := range t.Spec.WorkspaceOperations {
|
|
log.Debugf("unarchiving workspace for taskID: %s, step: %d", level, op.TaskID, op.Step)
|
|
resp, err := e.runserviceClient.GetArchive(ctx, op.TaskID, op.Step)
|
|
if err != nil {
|
|
// TODO(sgotti) retry before giving up
|
|
fmt.Fprintf(logf, "error reading workspace archive: %v\n", err)
|
|
return -1, err
|
|
}
|
|
archivef := resp.Body
|
|
if err := e.unarchive(ctx, t, archivef, pod, logf, s.DestDir, false, false); err != nil {
|
|
archivef.Close()
|
|
return -1, err
|
|
}
|
|
archivef.Close()
|
|
}
|
|
|
|
return 0, nil
|
|
}
|
|
|
|
func (e *Executor) doSaveCacheStep(ctx context.Context, s *types.SaveCacheStep, t *types.ExecutorTask, pod driver.Pod, logPath string, archivePath string) (int, error) {
|
|
cmd := []string{toolboxContainerPath, "archive"}
|
|
|
|
if err := os.MkdirAll(filepath.Dir(logPath), 0770); err != nil {
|
|
return -1, err
|
|
}
|
|
logf, err := os.Create(logPath)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
defer logf.Close()
|
|
|
|
save := false
|
|
|
|
// calculate key from template
|
|
userKey, err := e.template(ctx, t, pod, logf, s.Key)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
fmt.Fprintf(logf, "cache key %q\n", userKey)
|
|
|
|
// append cache prefix
|
|
key := t.Spec.CachePrefix + "-" + userKey
|
|
|
|
// check that the cache key doesn't already exists
|
|
resp, err := e.runserviceClient.CheckCache(ctx, key, false)
|
|
if err != nil {
|
|
// ignore 404 errors since they means that the cache key doesn't exists
|
|
if resp != nil && resp.StatusCode == http.StatusNotFound {
|
|
fmt.Fprintf(logf, "no cache available for key %q. Saving.\n", userKey)
|
|
save = true
|
|
} else {
|
|
// TODO(sgotti) retry before giving up
|
|
fmt.Fprintf(logf, "error checking for cache key %q: %v\n", userKey, err)
|
|
return -1, err
|
|
}
|
|
}
|
|
if !save {
|
|
fmt.Fprintf(logf, "cache for key %q already exists\n", userKey)
|
|
return 0, nil
|
|
}
|
|
|
|
fmt.Fprintf(logf, "archiving cache with key %q\n", userKey)
|
|
if err := os.MkdirAll(filepath.Dir(archivePath), 0770); err != nil {
|
|
return -1, err
|
|
}
|
|
archivef, err := os.Create(archivePath)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
defer archivef.Close()
|
|
|
|
workingDir, err := e.expandDir(ctx, t, pod, logf, t.Spec.WorkingDir)
|
|
if err != nil {
|
|
_, _ = io.WriteString(logf, fmt.Sprintf("failed to expand working dir %q. Error: %s\n", t.Spec.WorkingDir, err))
|
|
return -1, err
|
|
}
|
|
|
|
execConfig := &driver.ExecConfig{
|
|
Cmd: cmd,
|
|
Env: t.Spec.Environment,
|
|
WorkingDir: workingDir,
|
|
User: stepUser(t),
|
|
AttachStdin: true,
|
|
Stdout: archivef,
|
|
Stderr: logf,
|
|
}
|
|
|
|
ce, err := pod.Exec(ctx, execConfig)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
type ArchiveInfo struct {
|
|
SourceDir string
|
|
DestDir string
|
|
Paths []string
|
|
}
|
|
type Archive struct {
|
|
ArchiveInfos []*ArchiveInfo
|
|
OutFile string
|
|
}
|
|
|
|
a := &Archive{
|
|
OutFile: "", // use stdout
|
|
ArchiveInfos: make([]*ArchiveInfo, len(s.Contents)),
|
|
}
|
|
|
|
for i, c := range s.Contents {
|
|
a.ArchiveInfos[i] = &ArchiveInfo{
|
|
SourceDir: c.SourceDir,
|
|
DestDir: c.DestDir,
|
|
Paths: c.Paths,
|
|
}
|
|
|
|
}
|
|
|
|
stdin := ce.Stdin()
|
|
enc := json.NewEncoder(stdin)
|
|
|
|
go func() {
|
|
_ = enc.Encode(a)
|
|
stdin.Close()
|
|
}()
|
|
|
|
exitCode, err := ce.Wait(ctx)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
if exitCode != 0 {
|
|
return exitCode, errors.Errorf("save cache archiving command ended with exit code %d", exitCode)
|
|
}
|
|
|
|
f, err := os.Open(archivePath)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
fi, err := f.Stat()
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
// send cache archive to scheduler
|
|
if resp, err := e.runserviceClient.PutCache(ctx, key, fi.Size(), f); err != nil {
|
|
if resp != nil && resp.StatusCode == http.StatusNotModified {
|
|
return exitCode, nil
|
|
}
|
|
return -1, err
|
|
}
|
|
|
|
return exitCode, nil
|
|
}
|
|
|
|
func (e *Executor) doRestoreCacheStep(ctx context.Context, s *types.RestoreCacheStep, t *types.ExecutorTask, pod driver.Pod, logPath string) (int, error) {
|
|
if err := os.MkdirAll(filepath.Dir(logPath), 0770); err != nil {
|
|
return -1, err
|
|
}
|
|
logf, err := os.Create(logPath)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
defer logf.Close()
|
|
|
|
fmt.Fprintf(logf, "restoring cache: %s\n", util.Dump(s))
|
|
for _, key := range s.Keys {
|
|
// calculate key from template
|
|
userKey, err := e.template(ctx, t, pod, logf, key)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
fmt.Fprintf(logf, "cache key %q\n", userKey)
|
|
|
|
// append cache prefix
|
|
key := t.Spec.CachePrefix + "-" + userKey
|
|
|
|
resp, err := e.runserviceClient.GetCache(ctx, key, true)
|
|
if err != nil {
|
|
// ignore 404 errors since they means that the cache key doesn't exists
|
|
if resp != nil && resp.StatusCode == http.StatusNotFound {
|
|
fmt.Fprintf(logf, "no cache available for key %q\n", userKey)
|
|
continue
|
|
}
|
|
// TODO(sgotti) retry before giving up
|
|
fmt.Fprintf(logf, "error reading cache: %v\n", err)
|
|
return -1, err
|
|
}
|
|
fmt.Fprintf(logf, "restoring cache with key %q\n", userKey)
|
|
cachef := resp.Body
|
|
if err := e.unarchive(ctx, t, cachef, pod, logf, s.DestDir, false, false); err != nil {
|
|
cachef.Close()
|
|
return -1, err
|
|
}
|
|
cachef.Close()
|
|
|
|
// stop here
|
|
break
|
|
}
|
|
|
|
return 0, nil
|
|
}
|
|
|
|
func (e *Executor) executorIDPath() string {
|
|
return filepath.Join(e.c.DataDir, "id")
|
|
}
|
|
|
|
func (e *Executor) tasksDir() string {
|
|
return filepath.Join(e.c.DataDir, "tasks")
|
|
}
|
|
|
|
func (e *Executor) taskPath(taskID string) string {
|
|
return filepath.Join(e.tasksDir(), taskID)
|
|
}
|
|
|
|
func (e *Executor) taskLogsPath(taskID string) string {
|
|
return filepath.Join(e.tasksDir(), taskID, "logs")
|
|
}
|
|
|
|
func (e *Executor) setupLogPath(taskID string) string {
|
|
return filepath.Join(e.taskLogsPath(taskID), "setup.log")
|
|
}
|
|
|
|
func (e *Executor) stepLogPath(taskID string, stepID int) string {
|
|
return filepath.Join(e.taskLogsPath(taskID), "steps", fmt.Sprintf("%d.log", stepID))
|
|
}
|
|
|
|
func (e *Executor) archivePath(taskID string, stepID int) string {
|
|
return filepath.Join(e.taskPath(taskID), "archives", fmt.Sprintf("%d.tar", stepID))
|
|
}
|
|
|
|
func (e *Executor) sendExecutorStatus(ctx context.Context) error {
|
|
labels := e.c.Labels
|
|
if labels == nil {
|
|
labels = make(map[string]string)
|
|
}
|
|
|
|
activeTasks := e.runningTasks.len()
|
|
|
|
archs, err := e.driver.Archs(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
executorGroup, err := e.driver.ExecutorGroup(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// report all the executors that are active OR that have some owned pods not yet removed
|
|
activeExecutors, err := e.driver.GetExecutors(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
pods, err := e.driver.GetPods(ctx, true)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
executorsMap := map[string]struct{}{}
|
|
for _, executorID := range activeExecutors {
|
|
executorsMap[executorID] = struct{}{}
|
|
}
|
|
for _, pod := range pods {
|
|
executorsMap[pod.ExecutorID()] = struct{}{}
|
|
}
|
|
siblingsExecutors := []string{}
|
|
for executorID := range executorsMap {
|
|
siblingsExecutors = append(siblingsExecutors, executorID)
|
|
}
|
|
|
|
executor := &types.Executor{
|
|
ID: e.id,
|
|
Archs: archs,
|
|
AllowPrivilegedContainers: e.c.AllowPrivilegedContainers,
|
|
ListenURL: e.listenURL,
|
|
Labels: labels,
|
|
ActiveTasksLimit: e.c.ActiveTasksLimit,
|
|
ActiveTasks: activeTasks,
|
|
Dynamic: e.dynamic,
|
|
ExecutorGroup: executorGroup,
|
|
SiblingsExecutors: siblingsExecutors,
|
|
}
|
|
|
|
log.Debugf("send executor status: %s", util.Dump(executor))
|
|
_, err = e.runserviceClient.SendExecutorStatus(ctx, executor)
|
|
return err
|
|
}
|
|
|
|
func (e *Executor) sendExecutorTaskStatus(ctx context.Context, et *types.ExecutorTask) error {
|
|
log.Debugf("send executor task: %s. status: %s", et.ID, et.Status.Phase)
|
|
_, err := e.runserviceClient.SendExecutorTaskStatus(ctx, e.id, et)
|
|
return err
|
|
}
|
|
|
|
func (e *Executor) executeTask(ctx context.Context, rt *runningTask) {
|
|
// * save in local state that we have a running task
|
|
// * start the pod
|
|
// * then update the executortask status to in-progress
|
|
// if something fails pod will be cleaned up by the pod cleaner goroutine
|
|
// In this way we are sure that the pod cleaner will only remove pod that don't
|
|
// have an in progress running task
|
|
|
|
rt.Lock()
|
|
|
|
defer func() {
|
|
rt.Lock()
|
|
rt.executing = false
|
|
rt.Unlock()
|
|
}()
|
|
|
|
et := rt.et
|
|
|
|
et.Status.Phase = types.ExecutorTaskPhaseRunning
|
|
et.Status.StartTime = util.TimeP(time.Now())
|
|
et.Status.SetupStep.Phase = types.ExecutorTaskPhaseRunning
|
|
et.Status.SetupStep.StartTime = util.TimeP(time.Now())
|
|
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
|
|
if err := e.setupTask(ctx, rt); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
et.Status.Phase = types.ExecutorTaskPhaseFailed
|
|
et.Status.EndTime = util.TimeP(time.Now())
|
|
et.Status.SetupStep.Phase = types.ExecutorTaskPhaseFailed
|
|
et.Status.SetupStep.EndTime = util.TimeP(time.Now())
|
|
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
rt.Unlock()
|
|
return
|
|
}
|
|
|
|
et.Status.SetupStep.Phase = types.ExecutorTaskPhaseSuccess
|
|
et.Status.SetupStep.EndTime = util.TimeP(time.Now())
|
|
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
|
|
rt.Unlock()
|
|
|
|
_, err := e.executeTaskSteps(ctx, rt, rt.pod)
|
|
|
|
rt.Lock()
|
|
if err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
if rt.et.Spec.Stop {
|
|
et.Status.Phase = types.ExecutorTaskPhaseStopped
|
|
} else {
|
|
et.Status.Phase = types.ExecutorTaskPhaseFailed
|
|
}
|
|
} else {
|
|
et.Status.Phase = types.ExecutorTaskPhaseSuccess
|
|
}
|
|
|
|
et.Status.EndTime = util.TimeP(time.Now())
|
|
|
|
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
rt.Unlock()
|
|
}
|
|
|
|
func (e *Executor) setupTask(ctx context.Context, rt *runningTask) error {
|
|
et := rt.et
|
|
if err := os.RemoveAll(e.taskPath(et.ID)); err != nil {
|
|
return err
|
|
}
|
|
if err := os.MkdirAll(e.taskPath(et.ID), 0770); err != nil {
|
|
return err
|
|
}
|
|
|
|
setupLogPath := e.setupLogPath(et.ID)
|
|
if err := os.MkdirAll(filepath.Dir(setupLogPath), 0770); err != nil {
|
|
return err
|
|
}
|
|
outf, err := os.Create(setupLogPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer outf.Close()
|
|
|
|
// error out if privileged containers are required but not allowed
|
|
requiresPrivilegedContainers := false
|
|
for _, c := range et.Spec.Containers {
|
|
if c.Privileged {
|
|
requiresPrivilegedContainers = true
|
|
break
|
|
}
|
|
}
|
|
if requiresPrivilegedContainers && !e.c.AllowPrivilegedContainers {
|
|
_, _ = outf.WriteString("Executor doesn't allow executing privileged containers.\n")
|
|
return errors.Errorf("executor doesn't allow executing privileged containers")
|
|
}
|
|
|
|
log.Debugf("starting pod")
|
|
|
|
dockerConfig, err := registry.GenDockerConfig(et.Spec.DockerRegistriesAuth, []string{et.Spec.Containers[0].Image})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
podConfig := &driver.PodConfig{
|
|
// generate a random pod id (don't use task id for future ability to restart
|
|
// tasks failed to start and don't clash with existing pods)
|
|
ID: uuid.NewV4().String(),
|
|
TaskID: et.ID,
|
|
Arch: et.Spec.Arch,
|
|
InitVolumeDir: toolboxContainerDir,
|
|
DockerConfig: dockerConfig,
|
|
Containers: make([]*driver.ContainerConfig, len(et.Spec.Containers)),
|
|
}
|
|
for i, c := range et.Spec.Containers {
|
|
var cmd []string
|
|
if i == 0 {
|
|
cmd = []string{toolboxContainerPath, "sleeper"}
|
|
}
|
|
if c.Entrypoint != "" {
|
|
cmd = strings.Split(c.Entrypoint, " ")
|
|
}
|
|
|
|
containerConfig := &driver.ContainerConfig{
|
|
Image: c.Image,
|
|
Cmd: cmd,
|
|
Env: c.Environment,
|
|
User: c.User,
|
|
Privileged: c.Privileged,
|
|
Volumes: make([]driver.Volume, len(c.Volumes)),
|
|
}
|
|
|
|
for vIndex, cVol := range c.Volumes {
|
|
containerConfig.Volumes[vIndex] = driver.Volume{
|
|
Path: cVol.Path,
|
|
}
|
|
if cVol.TmpFS != nil {
|
|
containerConfig.Volumes[vIndex].TmpFS = &driver.VolumeTmpFS{
|
|
Size: cVol.TmpFS.Size,
|
|
}
|
|
}
|
|
}
|
|
|
|
podConfig.Containers[i] = containerConfig
|
|
}
|
|
|
|
_, _ = outf.WriteString("Starting pod.\n")
|
|
pod, err := e.driver.NewPod(ctx, podConfig, outf)
|
|
if err != nil {
|
|
_, _ = outf.WriteString(fmt.Sprintf("Pod failed to start. Error: %s\n", err))
|
|
return err
|
|
}
|
|
_, _ = outf.WriteString("Pod started.\n")
|
|
|
|
if et.Spec.WorkingDir != "" {
|
|
_, _ = outf.WriteString(fmt.Sprintf("Creating working dir %q.\n", et.Spec.WorkingDir))
|
|
if err := e.mkdir(ctx, et, pod, outf, et.Spec.WorkingDir); err != nil {
|
|
_, _ = outf.WriteString(fmt.Sprintf("Failed to create working dir %q. Error: %s\n", et.Spec.WorkingDir, err))
|
|
return err
|
|
}
|
|
}
|
|
|
|
rt.pod = pod
|
|
return nil
|
|
}
|
|
|
|
func (e *Executor) executeTaskSteps(ctx context.Context, rt *runningTask, pod driver.Pod) (int, error) {
|
|
for i, step := range rt.et.Spec.Steps {
|
|
rt.Lock()
|
|
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseRunning
|
|
rt.et.Status.Steps[i].StartTime = util.TimeP(time.Now())
|
|
if err := e.sendExecutorTaskStatus(ctx, rt.et); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
rt.Unlock()
|
|
|
|
var err error
|
|
var exitCode int
|
|
var stepName string
|
|
|
|
switch s := step.(type) {
|
|
case *types.RunStep:
|
|
log.Debugf("run step: %s", util.Dump(s))
|
|
stepName = s.Name
|
|
exitCode, err = e.doRunStep(ctx, s, rt.et, pod, e.stepLogPath(rt.et.ID, i))
|
|
|
|
case *types.SaveToWorkspaceStep:
|
|
log.Debugf("save to workspace step: %s", util.Dump(s))
|
|
stepName = s.Name
|
|
archivePath := e.archivePath(rt.et.ID, i)
|
|
exitCode, err = e.doSaveToWorkspaceStep(ctx, s, rt.et, pod, e.stepLogPath(rt.et.ID, i), archivePath)
|
|
|
|
case *types.RestoreWorkspaceStep:
|
|
log.Debugf("restore workspace step: %s", util.Dump(s))
|
|
stepName = s.Name
|
|
exitCode, err = e.doRestoreWorkspaceStep(ctx, s, rt.et, pod, e.stepLogPath(rt.et.ID, i))
|
|
|
|
case *types.SaveCacheStep:
|
|
log.Debugf("save cache step: %s", util.Dump(s))
|
|
stepName = s.Name
|
|
archivePath := e.archivePath(rt.et.ID, i)
|
|
exitCode, err = e.doSaveCacheStep(ctx, s, rt.et, pod, e.stepLogPath(rt.et.ID, i), archivePath)
|
|
|
|
case *types.RestoreCacheStep:
|
|
log.Debugf("restore cache step: %s", util.Dump(s))
|
|
stepName = s.Name
|
|
exitCode, err = e.doRestoreCacheStep(ctx, s, rt.et, pod, e.stepLogPath(rt.et.ID, i))
|
|
|
|
default:
|
|
return i, errors.Errorf("unknown step type: %s", util.Dump(s))
|
|
}
|
|
|
|
var serr error
|
|
|
|
rt.Lock()
|
|
rt.et.Status.Steps[i].EndTime = util.TimeP(time.Now())
|
|
|
|
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseSuccess
|
|
|
|
if err != nil {
|
|
if rt.et.Spec.Stop {
|
|
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseStopped
|
|
} else {
|
|
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseFailed
|
|
}
|
|
serr = errors.Errorf("failed to execute step %s: %w", util.Dump(step), err)
|
|
} else if exitCode != 0 {
|
|
if rt.et.Spec.Stop {
|
|
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseStopped
|
|
} else {
|
|
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseFailed
|
|
}
|
|
rt.et.Status.Steps[i].ExitStatus = util.IntP(exitCode)
|
|
serr = errors.Errorf("step %q failed with exitcode %d", stepName, exitCode)
|
|
} else if exitCode == 0 {
|
|
rt.et.Status.Steps[i].ExitStatus = util.IntP(exitCode)
|
|
}
|
|
|
|
if err := e.sendExecutorTaskStatus(ctx, rt.et); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
rt.Unlock()
|
|
|
|
if serr != nil {
|
|
return i, serr
|
|
}
|
|
}
|
|
|
|
return 0, nil
|
|
}
|
|
|
|
func (e *Executor) podsCleanerLoop(ctx context.Context) {
|
|
for {
|
|
log.Debugf("podsCleaner")
|
|
|
|
if err := e.podsCleaner(ctx); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
|
|
sleepCh := time.NewTimer(1 * time.Second).C
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-sleepCh:
|
|
}
|
|
}
|
|
}
|
|
|
|
func (e *Executor) podsCleaner(ctx context.Context) error {
|
|
pods, err := e.getAllPods(ctx, true)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
executors, err := e.driver.GetExecutors(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// always add ourself to executors
|
|
executors = append(executors, e.id)
|
|
|
|
for _, pod := range pods {
|
|
taskID := pod.TaskID()
|
|
// clean our owned pods
|
|
if pod.ExecutorID() == e.id {
|
|
if _, ok := e.runningTasks.get(taskID); !ok {
|
|
log.Infof("removing pod %s for not running task: %s", pod.ID(), taskID)
|
|
_ = pod.Remove(ctx)
|
|
}
|
|
}
|
|
|
|
// if no executor owns the pod we'll delete it
|
|
owned := false
|
|
for _, executorID := range executors {
|
|
if pod.ExecutorID() == executorID {
|
|
owned = true
|
|
break
|
|
}
|
|
}
|
|
if !owned {
|
|
log.Infof("removing pod %s since it's not owned by any active executor", pod.ID())
|
|
_ = pod.Remove(ctx)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (e *Executor) executorStatusSenderLoop(ctx context.Context) {
|
|
for {
|
|
log.Debugf("executorStatusSenderLoop")
|
|
|
|
if err := e.sendExecutorStatus(ctx); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
|
|
sleepCh := time.NewTimer(2 * time.Second).C
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-sleepCh:
|
|
}
|
|
}
|
|
}
|
|
|
|
func (e *Executor) executorTasksStatusSenderLoop(ctx context.Context) {
|
|
for {
|
|
log.Debugf("executorTasksStatusSenderLoop")
|
|
|
|
for _, rtID := range e.runningTasks.ids() {
|
|
rt, ok := e.runningTasks.get(rtID)
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
rt.Lock()
|
|
if err := e.sendExecutorTaskStatus(ctx, rt.et); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
rt.Unlock()
|
|
continue
|
|
}
|
|
|
|
// remove running task if send was successful and it's not executing
|
|
if !rt.executing {
|
|
e.runningTasks.delete(rtID)
|
|
}
|
|
rt.Unlock()
|
|
}
|
|
|
|
sleepCh := time.NewTimer(2 * time.Second).C
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-sleepCh:
|
|
}
|
|
}
|
|
}
|
|
|
|
func (e *Executor) tasksUpdaterLoop(ctx context.Context) {
|
|
for {
|
|
log.Debugf("tasksUpdater")
|
|
|
|
if err := e.tasksUpdater(ctx); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
|
|
sleepCh := time.NewTimer(2 * time.Second).C
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-sleepCh:
|
|
}
|
|
}
|
|
}
|
|
|
|
// taskUpdater fetches the executor tasks from the scheduler and handles them
|
|
// this is useful to catch up when some tasks submissions from the scheduler to the executor
|
|
// APIs fails
|
|
func (e *Executor) tasksUpdater(ctx context.Context) error {
|
|
ets, _, err := e.runserviceClient.GetExecutorTasks(ctx, e.id)
|
|
if err != nil {
|
|
log.Warnf("err: %v", err)
|
|
return err
|
|
}
|
|
log.Debugf("ets: %v", util.Dump(ets))
|
|
for _, et := range ets {
|
|
e.taskUpdater(ctx, et)
|
|
}
|
|
|
|
// remove runningTasks not existing in the runservice
|
|
etIDsMap := map[string]struct{}{}
|
|
for _, et := range ets {
|
|
etIDsMap[et.ID] = struct{}{}
|
|
}
|
|
|
|
for _, rtID := range e.runningTasks.ids() {
|
|
if _, ok := etIDsMap[rtID]; !ok {
|
|
e.runningTasks.delete(rtID)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (e *Executor) taskUpdater(ctx context.Context, et *types.ExecutorTask) {
|
|
log.Debugf("et: %v", util.Dump(et))
|
|
if et.Spec.ExecutorID != e.id {
|
|
return
|
|
}
|
|
|
|
rt, _ := e.runningTasks.get(et.ID)
|
|
if rt != nil {
|
|
rt.Lock()
|
|
// update running task Spec.Stop value only when there's a transitions from false to true,
|
|
// other spec values cannot change once the task has been scheduled
|
|
if !rt.et.Spec.Stop && et.Spec.Stop {
|
|
rt.et.Spec.Stop = et.Spec.Stop
|
|
|
|
if !rt.et.Status.Phase.IsFinished() && rt.pod != nil {
|
|
if err := rt.pod.Stop(ctx); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
}
|
|
}
|
|
rt.Unlock()
|
|
|
|
return
|
|
}
|
|
|
|
// rt == nil
|
|
|
|
// only send cancelled phase when the executor task isn't in running tasks and is not started
|
|
if et.Spec.Stop && et.Status.Phase == types.ExecutorTaskPhaseNotStarted {
|
|
et.Status.Phase = types.ExecutorTaskPhaseCancelled
|
|
go func() {
|
|
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
}()
|
|
}
|
|
|
|
if et.Status.Phase == types.ExecutorTaskPhaseRunning {
|
|
log.Infof("marking executor task %s as failed since there's no running task", et.ID)
|
|
et.Status.Phase = types.ExecutorTaskPhaseFailed
|
|
et.Status.EndTime = util.TimeP(time.Now())
|
|
// mark in progress step as failed too
|
|
for _, s := range et.Status.Steps {
|
|
if s.Phase == types.ExecutorTaskPhaseRunning {
|
|
s.Phase = types.ExecutorTaskPhaseFailed
|
|
s.EndTime = util.TimeP(time.Now())
|
|
}
|
|
}
|
|
go func() {
|
|
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
}()
|
|
}
|
|
|
|
if !et.Spec.Stop && et.Status.Phase == types.ExecutorTaskPhaseNotStarted {
|
|
activeTasks := e.runningTasks.len()
|
|
// don't start task if we have reached the active tasks limit (they will be retried
|
|
// on next taskUpdater calls)
|
|
if activeTasks > e.c.ActiveTasksLimit {
|
|
return
|
|
}
|
|
rt := &runningTask{
|
|
et: et,
|
|
executing: true,
|
|
}
|
|
|
|
if !e.runningTasks.addIfNotExists(et.ID, rt) {
|
|
log.Warnf("task %s already running, this shouldn't happen", et.ID)
|
|
return
|
|
}
|
|
|
|
go e.executeTask(ctx, rt)
|
|
}
|
|
}
|
|
|
|
func (e *Executor) tasksDataCleanerLoop(ctx context.Context) {
|
|
for {
|
|
log.Debugf("tasksDataCleaner")
|
|
|
|
if err := e.tasksDataCleaner(ctx); err != nil {
|
|
log.Errorf("err: %+v", err)
|
|
}
|
|
|
|
sleepCh := time.NewTimer(2 * time.Second).C
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-sleepCh:
|
|
}
|
|
}
|
|
}
|
|
|
|
func (e *Executor) tasksDataCleaner(ctx context.Context) error {
|
|
entries, err := ioutil.ReadDir(e.tasksDir())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for _, entry := range entries {
|
|
if !entry.IsDir() {
|
|
continue
|
|
}
|
|
etID := filepath.Base(entry.Name())
|
|
|
|
_, resp, err := e.runserviceClient.GetExecutorTask(ctx, e.id, etID)
|
|
if err != nil {
|
|
if resp == nil {
|
|
return err
|
|
}
|
|
if resp.StatusCode != http.StatusNotFound {
|
|
return err
|
|
}
|
|
}
|
|
if resp.StatusCode == http.StatusNotFound {
|
|
taskDir := e.taskPath(etID)
|
|
log.Infof("removing task dir %q", taskDir)
|
|
// remove task dir
|
|
if err := os.RemoveAll(taskDir); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
type runningTasks struct {
|
|
tasks map[string]*runningTask
|
|
m sync.Mutex
|
|
}
|
|
|
|
type runningTask struct {
|
|
sync.Mutex
|
|
|
|
et *types.ExecutorTask
|
|
pod driver.Pod
|
|
|
|
executing bool
|
|
}
|
|
|
|
func (r *runningTasks) get(rtID string) (*runningTask, bool) {
|
|
r.m.Lock()
|
|
defer r.m.Unlock()
|
|
rt, ok := r.tasks[rtID]
|
|
return rt, ok
|
|
}
|
|
|
|
func (r *runningTasks) addIfNotExists(rtID string, rt *runningTask) bool {
|
|
r.m.Lock()
|
|
defer r.m.Unlock()
|
|
if _, ok := r.tasks[rtID]; ok {
|
|
return false
|
|
}
|
|
r.tasks[rtID] = rt
|
|
return true
|
|
}
|
|
|
|
func (r *runningTasks) delete(rtID string) {
|
|
r.m.Lock()
|
|
defer r.m.Unlock()
|
|
delete(r.tasks, rtID)
|
|
}
|
|
|
|
func (r *runningTasks) len() int {
|
|
r.m.Lock()
|
|
defer r.m.Unlock()
|
|
return len(r.tasks)
|
|
}
|
|
|
|
func (r *runningTasks) ids() []string {
|
|
ids := []string{}
|
|
r.m.Lock()
|
|
defer r.m.Unlock()
|
|
for id := range r.tasks {
|
|
ids = append(ids, id)
|
|
}
|
|
return ids
|
|
}
|
|
|
|
func (e *Executor) handleTasks(ctx context.Context, c <-chan *types.ExecutorTask) {
|
|
for et := range c {
|
|
e.taskUpdater(ctx, et)
|
|
}
|
|
}
|
|
|
|
func (e *Executor) getExecutorID() (string, error) {
|
|
id, err := ioutil.ReadFile(e.executorIDPath())
|
|
if err != nil && !os.IsNotExist(err) {
|
|
return "", err
|
|
}
|
|
return string(id), nil
|
|
}
|
|
|
|
func (e *Executor) saveExecutorID(id string) error {
|
|
if err := common.WriteFileAtomic(e.executorIDPath(), []byte(id), 0660); err != nil {
|
|
return errors.Errorf("failed to write executor id file: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
type Executor struct {
|
|
c *config.Executor
|
|
runserviceClient *rsclient.Client
|
|
id string
|
|
runningTasks *runningTasks
|
|
driver driver.Driver
|
|
listenAddress string
|
|
listenURL string
|
|
dynamic bool
|
|
}
|
|
|
|
func NewExecutor(ctx context.Context, l *zap.Logger, c *config.Executor) (*Executor, error) {
|
|
if l != nil {
|
|
logger = l
|
|
}
|
|
if c.Debug {
|
|
level.SetLevel(zapcore.DebugLevel)
|
|
}
|
|
log = logger.Sugar()
|
|
|
|
var err error
|
|
c.ToolboxPath, err = filepath.Abs(c.ToolboxPath)
|
|
if err != nil {
|
|
return nil, errors.Errorf("cannot determine \"agola-toolbox\" absolute path: %w", err)
|
|
}
|
|
|
|
e := &Executor{
|
|
c: c,
|
|
runserviceClient: rsclient.NewClient(c.RunserviceURL),
|
|
runningTasks: &runningTasks{
|
|
tasks: make(map[string]*runningTask),
|
|
},
|
|
}
|
|
|
|
if err := os.MkdirAll(e.tasksDir(), 0770); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
id, err := e.getExecutorID()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if id == "" {
|
|
id = uuid.NewV4().String()
|
|
if err := e.saveExecutorID(id); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
e.id = id
|
|
|
|
// TODO(sgotti) now the first available private ip will be used and the executor will bind to the wildcard address
|
|
// improve this to let the user define the bind and the advertize address
|
|
addr, err := sockaddr.GetPrivateIP()
|
|
if err != nil {
|
|
return nil, errors.Errorf("cannot discover executor listen address: %w", err)
|
|
}
|
|
if addr == "" {
|
|
return nil, errors.Errorf("cannot discover executor listen address")
|
|
}
|
|
u := url.URL{Scheme: "http"}
|
|
if c.Web.TLS {
|
|
u.Scheme = "https"
|
|
}
|
|
_, port, err := net.SplitHostPort(c.Web.ListenAddress)
|
|
if err != nil {
|
|
return nil, errors.Errorf("cannot get web listen port: %w", err)
|
|
}
|
|
u.Host = net.JoinHostPort(addr, port)
|
|
e.listenURL = u.String()
|
|
|
|
e.listenAddress = fmt.Sprintf(":%s", port)
|
|
|
|
var d driver.Driver
|
|
switch c.Driver.Type {
|
|
case config.DriverTypeDocker:
|
|
d, err = driver.NewDockerDriver(logger, e.id, e.c.ToolboxPath)
|
|
if err != nil {
|
|
return nil, errors.Errorf("failed to create docker driver: %w", err)
|
|
}
|
|
case config.DriverTypeK8s:
|
|
d, err = driver.NewK8sDriver(logger, e.id, c.ToolboxPath)
|
|
if err != nil {
|
|
return nil, errors.Errorf("failed to create kubernetes driver: %w", err)
|
|
}
|
|
e.dynamic = true
|
|
default:
|
|
return nil, errors.Errorf("unknown driver type %q", c.Driver.Type)
|
|
}
|
|
e.driver = d
|
|
|
|
return e, nil
|
|
}
|
|
|
|
func (e *Executor) Run(ctx context.Context) error {
|
|
if err := e.driver.Setup(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
ch := make(chan *types.ExecutorTask)
|
|
schedulerHandler := NewTaskSubmissionHandler(ch)
|
|
logsHandler := NewLogsHandler(logger, e)
|
|
archivesHandler := NewArchivesHandler(e)
|
|
|
|
router := mux.NewRouter()
|
|
apirouter := router.PathPrefix("/api/v1alpha").Subrouter()
|
|
|
|
apirouter.Handle("/executor", schedulerHandler).Methods("POST")
|
|
apirouter.Handle("/executor/logs", logsHandler).Methods("GET")
|
|
apirouter.Handle("/executor/archives", archivesHandler).Methods("GET")
|
|
|
|
go e.executorStatusSenderLoop(ctx)
|
|
go e.executorTasksStatusSenderLoop(ctx)
|
|
go e.podsCleanerLoop(ctx)
|
|
go e.tasksUpdaterLoop(ctx)
|
|
go e.tasksDataCleanerLoop(ctx)
|
|
|
|
go e.handleTasks(ctx, ch)
|
|
|
|
httpServer := http.Server{
|
|
Addr: e.listenAddress,
|
|
Handler: apirouter,
|
|
}
|
|
lerrCh := make(chan error)
|
|
go func() {
|
|
lerrCh <- httpServer.ListenAndServe()
|
|
}()
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
log.Infof("runservice executor exiting")
|
|
httpServer.Close()
|
|
case err := <-lerrCh:
|
|
if err != nil {
|
|
log.Errorf("http server listen error: %v", err)
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|