agola/internal/services/executor/executor.go
Simone Gotti ecf355721f docker: create a toolbox volume for every pod
Instead of doing the current hack of copying the agola toolbox inside the host
tmp dir (always done but only needed when running the executor inside a docker
container) that has different issues (like tmp file removal done by
tmpwatch/systemd-tmpfiles), use a solution similar to the k8s driver: for every
pod create a volume containing the agola-toolbox and remove it at pod removal.

We could also use a single "global" volume but we should handle cases like
volume removal (i.e. a docker volume prune command). So for now just create a
dedicated per pod volume.
2020-01-10 12:25:12 +01:00

1468 lines
35 KiB
Go

// Copyright 2019 Sorint.lab
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied
// See the License for the specific language governing permissions and
// limitations under the License.
package executor
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"sync"
"time"
"agola.io/agola/internal/common"
slog "agola.io/agola/internal/log"
"agola.io/agola/internal/services/config"
"agola.io/agola/internal/services/executor/driver"
"agola.io/agola/internal/services/executor/registry"
"agola.io/agola/internal/util"
rsclient "agola.io/agola/services/runservice/client"
"agola.io/agola/services/runservice/types"
uuid "github.com/satori/go.uuid"
"github.com/gorilla/mux"
sockaddr "github.com/hashicorp/go-sockaddr"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
errors "golang.org/x/xerrors"
)
var level = zap.NewAtomicLevelAt(zapcore.InfoLevel)
var logger = slog.New(level)
var log = logger.Sugar()
const (
defaultShell = "/bin/sh -e"
toolboxContainerDir = "/mnt/agola"
)
var (
toolboxContainerPath = filepath.Join(toolboxContainerDir, "/agola-toolbox")
)
func (e *Executor) getAllPods(ctx context.Context, all bool) ([]driver.Pod, error) {
return e.driver.GetPods(ctx, all)
}
func stepUser(t *types.ExecutorTask) string {
// use the container specified user and override with task user if defined
user := t.Spec.Containers[0].User
if t.Spec.User != "" {
user = t.Spec.User
}
return user
}
func (e *Executor) createFile(ctx context.Context, pod driver.Pod, command, user string, outf io.Writer) (string, error) {
cmd := []string{toolboxContainerPath, "createfile"}
var buf bytes.Buffer
execConfig := &driver.ExecConfig{
Cmd: cmd,
User: user,
AttachStdin: true,
Stdout: &buf,
Stderr: outf,
}
ce, err := pod.Exec(ctx, execConfig)
if err != nil {
return "", err
}
stdin := ce.Stdin()
go func() {
_, _ = io.WriteString(stdin, command+"\n")
stdin.Close()
}()
exitCode, err := ce.Wait(ctx)
if err != nil {
return "", err
}
if exitCode != 0 {
return "", errors.Errorf("toolbox exited with code: %d", exitCode)
}
return buf.String(), nil
}
func (e *Executor) doRunStep(ctx context.Context, s *types.RunStep, t *types.ExecutorTask, pod driver.Pod, logPath string) (int, error) {
if err := os.MkdirAll(filepath.Dir(logPath), 0770); err != nil {
return -1, err
}
outf, err := os.Create(logPath)
if err != nil {
return -1, err
}
defer outf.Close()
// TODO(sgotti) this line is used only for old runconfig versions that don't
// set a task default shell in the runconfig
shell := defaultShell
if t.Spec.Shell != "" {
shell = t.Spec.Shell
}
if s.Shell != "" {
shell = s.Shell
}
var cmd []string
if s.Command != "" {
filename, err := e.createFile(ctx, pod, s.Command, stepUser(t), outf)
if err != nil {
return -1, errors.Errorf("create file err: %v", err)
}
args := strings.Split(shell, " ")
cmd = append(args, filename)
} else {
cmd = strings.Split(shell, " ")
}
// override task working dir with runstep working dir if provided
workingDir := t.Spec.WorkingDir
if s.WorkingDir != "" {
workingDir = s.WorkingDir
}
// generate the environment using the task environment and then overriding with the runstep environment
environment := map[string]string{}
for envName, envValue := range t.Spec.Environment {
environment[envName] = envValue
}
for envName, envValue := range s.Environment {
environment[envName] = envValue
}
workingDir, err = e.expandDir(ctx, t, pod, outf, workingDir)
if err != nil {
_, _ = outf.WriteString(fmt.Sprintf("failed to expand working dir %q. Error: %s\n", workingDir, err))
return -1, err
}
execConfig := &driver.ExecConfig{
Cmd: cmd,
Env: environment,
WorkingDir: workingDir,
User: stepUser(t),
AttachStdin: true,
Stdout: outf,
Stderr: outf,
Tty: *s.Tty,
}
ce, err := pod.Exec(ctx, execConfig)
if err != nil {
return -1, err
}
exitCode, err := ce.Wait(ctx)
if err != nil {
return -1, err
}
return exitCode, nil
}
func (e *Executor) doSaveToWorkspaceStep(ctx context.Context, s *types.SaveToWorkspaceStep, t *types.ExecutorTask, pod driver.Pod, logPath string, archivePath string) (int, error) {
cmd := []string{toolboxContainerPath, "archive"}
if err := os.MkdirAll(filepath.Dir(logPath), 0770); err != nil {
return -1, err
}
logf, err := os.Create(logPath)
if err != nil {
return -1, err
}
defer logf.Close()
if err := os.MkdirAll(filepath.Dir(archivePath), 0770); err != nil {
return -1, err
}
archivef, err := os.Create(archivePath)
if err != nil {
return -1, err
}
defer archivef.Close()
workingDir, err := e.expandDir(ctx, t, pod, logf, t.Spec.WorkingDir)
if err != nil {
_, _ = logf.WriteString(fmt.Sprintf("failed to expand working dir %q. Error: %s\n", t.Spec.WorkingDir, err))
return -1, err
}
execConfig := &driver.ExecConfig{
Cmd: cmd,
Env: t.Spec.Environment,
WorkingDir: workingDir,
User: stepUser(t),
AttachStdin: true,
Stdout: archivef,
Stderr: logf,
}
ce, err := pod.Exec(ctx, execConfig)
if err != nil {
return -1, err
}
type ArchiveInfo struct {
SourceDir string
DestDir string
Paths []string
}
type Archive struct {
ArchiveInfos []*ArchiveInfo
OutFile string
}
a := &Archive{
OutFile: "", // use stdout
ArchiveInfos: make([]*ArchiveInfo, len(s.Contents)),
}
for i, c := range s.Contents {
a.ArchiveInfos[i] = &ArchiveInfo{
SourceDir: c.SourceDir,
DestDir: c.DestDir,
Paths: c.Paths,
}
}
stdin := ce.Stdin()
enc := json.NewEncoder(stdin)
go func() {
_ = enc.Encode(a)
stdin.Close()
}()
exitCode, err := ce.Wait(ctx)
if err != nil {
return -1, err
}
return exitCode, nil
}
func (e *Executor) expandDir(ctx context.Context, t *types.ExecutorTask, pod driver.Pod, logf io.Writer, dir string) (string, error) {
args := []string{dir}
cmd := append([]string{toolboxContainerPath, "expanddir"}, args...)
// limit the template answer to max 1MiB
stdout := &bytes.Buffer{}
execConfig := &driver.ExecConfig{
Cmd: cmd,
Env: t.Spec.Environment,
User: stepUser(t),
AttachStdin: true,
Stdout: stdout,
Stderr: logf,
}
ce, err := pod.Exec(ctx, execConfig)
if err != nil {
return "", err
}
exitCode, err := ce.Wait(ctx)
if err != nil {
return "", err
}
if exitCode != 0 {
return "", errors.Errorf("expanddir ended with exit code %d", exitCode)
}
return stdout.String(), nil
}
func (e *Executor) mkdir(ctx context.Context, t *types.ExecutorTask, pod driver.Pod, logf io.Writer, dir string) error {
args := []string{dir}
cmd := append([]string{toolboxContainerPath, "mkdir"}, args...)
execConfig := &driver.ExecConfig{
Cmd: cmd,
Env: t.Spec.Environment,
User: stepUser(t),
AttachStdin: true,
Stdout: logf,
Stderr: logf,
}
ce, err := pod.Exec(ctx, execConfig)
if err != nil {
return err
}
exitCode, err := ce.Wait(ctx)
if err != nil {
return err
}
if exitCode != 0 {
return errors.Errorf("mkdir ended with exit code %d", exitCode)
}
return nil
}
func (e *Executor) template(ctx context.Context, t *types.ExecutorTask, pod driver.Pod, logf io.Writer, key string) (string, error) {
cmd := []string{toolboxContainerPath, "template"}
// limit the template answer to max 1MiB
stdout := util.NewLimitedBuffer(1024 * 1024)
workingDir, err := e.expandDir(ctx, t, pod, logf, t.Spec.WorkingDir)
if err != nil {
_, _ = io.WriteString(logf, fmt.Sprintf("failed to expand working dir %q. Error: %s\n", t.Spec.WorkingDir, err))
return "", err
}
execConfig := &driver.ExecConfig{
Cmd: cmd,
Env: t.Spec.Environment,
WorkingDir: workingDir,
User: stepUser(t),
AttachStdin: true,
Stdout: stdout,
Stderr: logf,
}
ce, err := pod.Exec(ctx, execConfig)
if err != nil {
return "", err
}
stdin := ce.Stdin()
go func() {
_, _ = io.WriteString(stdin, key)
stdin.Close()
}()
exitCode, err := ce.Wait(ctx)
if err != nil {
return "", err
}
if exitCode != 0 {
return "", errors.Errorf("template ended with exit code %d", exitCode)
}
return stdout.String(), nil
}
func (e *Executor) unarchive(ctx context.Context, t *types.ExecutorTask, source io.Reader, pod driver.Pod, logf io.Writer, destDir string, overwrite, removeDestDir bool) error {
args := []string{"--destdir", destDir}
if overwrite {
args = append(args, "--overwrite")
}
if removeDestDir {
args = append(args, "--remove-destdir")
}
cmd := append([]string{toolboxContainerPath, "unarchive"}, args...)
workingDir, err := e.expandDir(ctx, t, pod, logf, t.Spec.WorkingDir)
if err != nil {
_, _ = io.WriteString(logf, fmt.Sprintf("failed to expand working dir %q. Error: %s\n", t.Spec.WorkingDir, err))
return err
}
execConfig := &driver.ExecConfig{
Cmd: cmd,
Env: t.Spec.Environment,
WorkingDir: workingDir,
User: stepUser(t),
AttachStdin: true,
Stdout: logf,
Stderr: logf,
}
ce, err := pod.Exec(ctx, execConfig)
if err != nil {
return err
}
stdin := ce.Stdin()
go func() {
_, _ = io.Copy(stdin, source)
stdin.Close()
}()
exitCode, err := ce.Wait(ctx)
if err != nil {
return err
}
if exitCode != 0 {
return errors.Errorf("unarchive ended with exit code %d", exitCode)
}
return nil
}
func (e *Executor) doRestoreWorkspaceStep(ctx context.Context, s *types.RestoreWorkspaceStep, t *types.ExecutorTask, pod driver.Pod, logPath string) (int, error) {
if err := os.MkdirAll(filepath.Dir(logPath), 0770); err != nil {
return -1, err
}
logf, err := os.Create(logPath)
if err != nil {
return -1, err
}
defer logf.Close()
for _, op := range t.Spec.WorkspaceOperations {
log.Debugf("unarchiving workspace for taskID: %s, step: %d", level, op.TaskID, op.Step)
resp, err := e.runserviceClient.GetArchive(ctx, op.TaskID, op.Step)
if err != nil {
// TODO(sgotti) retry before giving up
fmt.Fprintf(logf, "error reading workspace archive: %v\n", err)
return -1, err
}
archivef := resp.Body
if err := e.unarchive(ctx, t, archivef, pod, logf, s.DestDir, false, false); err != nil {
archivef.Close()
return -1, err
}
archivef.Close()
}
return 0, nil
}
func (e *Executor) doSaveCacheStep(ctx context.Context, s *types.SaveCacheStep, t *types.ExecutorTask, pod driver.Pod, logPath string, archivePath string) (int, error) {
cmd := []string{toolboxContainerPath, "archive"}
if err := os.MkdirAll(filepath.Dir(logPath), 0770); err != nil {
return -1, err
}
logf, err := os.Create(logPath)
if err != nil {
return -1, err
}
defer logf.Close()
save := false
// calculate key from template
userKey, err := e.template(ctx, t, pod, logf, s.Key)
if err != nil {
return -1, err
}
fmt.Fprintf(logf, "cache key %q\n", userKey)
// append cache prefix
key := t.Spec.CachePrefix + "-" + userKey
// check that the cache key doesn't already exists
resp, err := e.runserviceClient.CheckCache(ctx, key, false)
if err != nil {
// ignore 404 errors since they means that the cache key doesn't exists
if resp != nil && resp.StatusCode == http.StatusNotFound {
fmt.Fprintf(logf, "no cache available for key %q. Saving.\n", userKey)
save = true
} else {
// TODO(sgotti) retry before giving up
fmt.Fprintf(logf, "error checking for cache key %q: %v\n", userKey, err)
return -1, err
}
}
if !save {
fmt.Fprintf(logf, "cache for key %q already exists\n", userKey)
return 0, nil
}
fmt.Fprintf(logf, "archiving cache with key %q\n", userKey)
if err := os.MkdirAll(filepath.Dir(archivePath), 0770); err != nil {
return -1, err
}
archivef, err := os.Create(archivePath)
if err != nil {
return -1, err
}
defer archivef.Close()
workingDir, err := e.expandDir(ctx, t, pod, logf, t.Spec.WorkingDir)
if err != nil {
_, _ = io.WriteString(logf, fmt.Sprintf("failed to expand working dir %q. Error: %s\n", t.Spec.WorkingDir, err))
return -1, err
}
execConfig := &driver.ExecConfig{
Cmd: cmd,
Env: t.Spec.Environment,
WorkingDir: workingDir,
User: stepUser(t),
AttachStdin: true,
Stdout: archivef,
Stderr: logf,
}
ce, err := pod.Exec(ctx, execConfig)
if err != nil {
return -1, err
}
type ArchiveInfo struct {
SourceDir string
DestDir string
Paths []string
}
type Archive struct {
ArchiveInfos []*ArchiveInfo
OutFile string
}
a := &Archive{
OutFile: "", // use stdout
ArchiveInfos: make([]*ArchiveInfo, len(s.Contents)),
}
for i, c := range s.Contents {
a.ArchiveInfos[i] = &ArchiveInfo{
SourceDir: c.SourceDir,
DestDir: c.DestDir,
Paths: c.Paths,
}
}
stdin := ce.Stdin()
enc := json.NewEncoder(stdin)
go func() {
_ = enc.Encode(a)
stdin.Close()
}()
exitCode, err := ce.Wait(ctx)
if err != nil {
return -1, err
}
if exitCode != 0 {
return exitCode, errors.Errorf("save cache archiving command ended with exit code %d", exitCode)
}
f, err := os.Open(archivePath)
if err != nil {
return -1, err
}
fi, err := f.Stat()
if err != nil {
return -1, err
}
// send cache archive to scheduler
if resp, err := e.runserviceClient.PutCache(ctx, key, fi.Size(), f); err != nil {
if resp != nil && resp.StatusCode == http.StatusNotModified {
return exitCode, nil
}
return -1, err
}
return exitCode, nil
}
func (e *Executor) doRestoreCacheStep(ctx context.Context, s *types.RestoreCacheStep, t *types.ExecutorTask, pod driver.Pod, logPath string) (int, error) {
if err := os.MkdirAll(filepath.Dir(logPath), 0770); err != nil {
return -1, err
}
logf, err := os.Create(logPath)
if err != nil {
return -1, err
}
defer logf.Close()
fmt.Fprintf(logf, "restoring cache: %s\n", util.Dump(s))
for _, key := range s.Keys {
// calculate key from template
userKey, err := e.template(ctx, t, pod, logf, key)
if err != nil {
return -1, err
}
fmt.Fprintf(logf, "cache key %q\n", userKey)
// append cache prefix
key := t.Spec.CachePrefix + "-" + userKey
resp, err := e.runserviceClient.GetCache(ctx, key, true)
if err != nil {
// ignore 404 errors since they means that the cache key doesn't exists
if resp != nil && resp.StatusCode == http.StatusNotFound {
fmt.Fprintf(logf, "no cache available for key %q\n", userKey)
continue
}
// TODO(sgotti) retry before giving up
fmt.Fprintf(logf, "error reading cache: %v\n", err)
return -1, err
}
fmt.Fprintf(logf, "restoring cache with key %q\n", userKey)
cachef := resp.Body
if err := e.unarchive(ctx, t, cachef, pod, logf, s.DestDir, false, false); err != nil {
cachef.Close()
return -1, err
}
cachef.Close()
// stop here
break
}
return 0, nil
}
func (e *Executor) executorIDPath() string {
return filepath.Join(e.c.DataDir, "id")
}
func (e *Executor) tasksDir() string {
return filepath.Join(e.c.DataDir, "tasks")
}
func (e *Executor) taskPath(taskID string) string {
return filepath.Join(e.tasksDir(), taskID)
}
func (e *Executor) taskLogsPath(taskID string) string {
return filepath.Join(e.tasksDir(), taskID, "logs")
}
func (e *Executor) setupLogPath(taskID string) string {
return filepath.Join(e.taskLogsPath(taskID), "setup.log")
}
func (e *Executor) stepLogPath(taskID string, stepID int) string {
return filepath.Join(e.taskLogsPath(taskID), "steps", fmt.Sprintf("%d.log", stepID))
}
func (e *Executor) archivePath(taskID string, stepID int) string {
return filepath.Join(e.taskPath(taskID), "archives", fmt.Sprintf("%d.tar", stepID))
}
func (e *Executor) sendExecutorStatus(ctx context.Context) error {
labels := e.c.Labels
if labels == nil {
labels = make(map[string]string)
}
activeTasks := e.runningTasks.len()
archs, err := e.driver.Archs(ctx)
if err != nil {
return err
}
executorGroup, err := e.driver.ExecutorGroup(ctx)
if err != nil {
return err
}
// report all the executors that are active OR that have some owned pods not yet removed
activeExecutors, err := e.driver.GetExecutors(ctx)
if err != nil {
return err
}
pods, err := e.driver.GetPods(ctx, true)
if err != nil {
return err
}
executorsMap := map[string]struct{}{}
for _, executorID := range activeExecutors {
executorsMap[executorID] = struct{}{}
}
for _, pod := range pods {
executorsMap[pod.ExecutorID()] = struct{}{}
}
siblingsExecutors := []string{}
for executorID := range executorsMap {
siblingsExecutors = append(siblingsExecutors, executorID)
}
executor := &types.Executor{
ID: e.id,
Archs: archs,
AllowPrivilegedContainers: e.c.AllowPrivilegedContainers,
ListenURL: e.listenURL,
Labels: labels,
ActiveTasksLimit: e.c.ActiveTasksLimit,
ActiveTasks: activeTasks,
Dynamic: e.dynamic,
ExecutorGroup: executorGroup,
SiblingsExecutors: siblingsExecutors,
}
log.Debugf("send executor status: %s", util.Dump(executor))
_, err = e.runserviceClient.SendExecutorStatus(ctx, executor)
return err
}
func (e *Executor) sendExecutorTaskStatus(ctx context.Context, et *types.ExecutorTask) error {
log.Debugf("send executor task: %s. status: %s", et.ID, et.Status.Phase)
_, err := e.runserviceClient.SendExecutorTaskStatus(ctx, e.id, et)
return err
}
func (e *Executor) stopTask(ctx context.Context, et *types.ExecutorTask) {
if rt, ok := e.runningTasks.get(et.ID); ok {
rt.Lock()
defer rt.Unlock()
if rt.et.Status.Phase.IsFinished() {
return
}
if rt.pod != nil {
if err := rt.pod.Stop(ctx); err != nil {
log.Errorf("err: %+v", err)
return
}
if rt.et.Status.Phase == types.ExecutorTaskPhaseNotStarted {
rt.et.Status.Phase = types.ExecutorTaskPhaseCancelled
} else {
rt.et.Status.Phase = types.ExecutorTaskPhaseStopped
}
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
log.Errorf("err: %+v", err)
return
}
}
}
}
func (e *Executor) executeTask(ctx context.Context, et *types.ExecutorTask) {
// * save in local state that we have a running task
// * start the pod
// * then update the executortask status to in-progress
// if something fails pod will be cleaned up by the pod cleaner goroutine
// In this way we are sure that the pod cleaner will only remove pod that don't
// have an in progress running task
if et.Status.Phase != types.ExecutorTaskPhaseNotStarted {
log.Debugf("task phase is not \"not started\"")
return
}
activeTasks := e.runningTasks.len()
// don't start task if we have reached the active tasks limit
// they will be executed later
if activeTasks > e.c.ActiveTasksLimit {
return
}
rt := &runningTask{
et: et,
executing: true,
}
rt.Lock()
if !e.runningTasks.addIfNotExists(et.ID, rt) {
log.Debugf("task %s already running", et.ID)
rt.Unlock()
return
}
defer func() {
rt.Lock()
rt.executing = false
rt.Unlock()
}()
et.Status.Phase = types.ExecutorTaskPhaseRunning
et.Status.StartTime = util.TimeP(time.Now())
et.Status.SetupStep.Phase = types.ExecutorTaskPhaseRunning
et.Status.SetupStep.StartTime = util.TimeP(time.Now())
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
log.Errorf("err: %+v", err)
}
if err := e.setupTask(ctx, rt); err != nil {
log.Errorf("err: %+v", err)
et.Status.Phase = types.ExecutorTaskPhaseFailed
et.Status.EndTime = util.TimeP(time.Now())
et.Status.SetupStep.Phase = types.ExecutorTaskPhaseFailed
et.Status.SetupStep.EndTime = util.TimeP(time.Now())
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
log.Errorf("err: %+v", err)
}
rt.Unlock()
return
}
et.Status.SetupStep.Phase = types.ExecutorTaskPhaseSuccess
et.Status.SetupStep.EndTime = util.TimeP(time.Now())
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
log.Errorf("err: %+v", err)
}
rt.Unlock()
_, err := e.executeTaskSteps(ctx, rt, rt.pod)
rt.Lock()
if err != nil {
log.Errorf("err: %+v", err)
et.Status.Phase = types.ExecutorTaskPhaseFailed
} else {
et.Status.Phase = types.ExecutorTaskPhaseSuccess
}
et.Status.EndTime = util.TimeP(time.Now())
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
log.Errorf("err: %+v", err)
}
rt.Unlock()
}
func (e *Executor) setupTask(ctx context.Context, rt *runningTask) error {
et := rt.et
if err := os.RemoveAll(e.taskPath(et.ID)); err != nil {
return err
}
if err := os.MkdirAll(e.taskPath(et.ID), 0770); err != nil {
return err
}
setupLogPath := e.setupLogPath(et.ID)
if err := os.MkdirAll(filepath.Dir(setupLogPath), 0770); err != nil {
return err
}
outf, err := os.Create(setupLogPath)
if err != nil {
return err
}
defer outf.Close()
// error out if privileged containers are required but not allowed
requiresPrivilegedContainers := false
for _, c := range et.Spec.Containers {
if c.Privileged {
requiresPrivilegedContainers = true
break
}
}
if requiresPrivilegedContainers && !e.c.AllowPrivilegedContainers {
_, _ = outf.WriteString("Executor doesn't allow executing privileged containers.\n")
return errors.Errorf("executor doesn't allow executing privileged containers")
}
log.Debugf("starting pod")
dockerConfig, err := registry.GenDockerConfig(et.Spec.DockerRegistriesAuth, []string{et.Spec.Containers[0].Image})
if err != nil {
return err
}
podConfig := &driver.PodConfig{
// generate a random pod id (don't use task id for future ability to restart
// tasks failed to start and don't clash with existing pods)
ID: uuid.NewV4().String(),
TaskID: et.ID,
Arch: et.Spec.Arch,
InitVolumeDir: toolboxContainerDir,
DockerConfig: dockerConfig,
Containers: make([]*driver.ContainerConfig, len(et.Spec.Containers)),
}
for i, c := range et.Spec.Containers {
var cmd []string
if i == 0 {
cmd = []string{toolboxContainerPath, "sleeper"}
}
if c.Entrypoint != "" {
cmd = strings.Split(c.Entrypoint, " ")
}
containerConfig := &driver.ContainerConfig{
Image: c.Image,
Cmd: cmd,
Env: c.Environment,
User: c.User,
Privileged: c.Privileged,
Volumes: make([]driver.Volume, len(c.Volumes)),
}
for vIndex, cVol := range c.Volumes {
containerConfig.Volumes[vIndex] = driver.Volume{
Path: cVol.Path,
}
if cVol.TmpFS != nil {
containerConfig.Volumes[vIndex].TmpFS = &driver.VolumeTmpFS{
Size: cVol.TmpFS.Size,
}
}
}
podConfig.Containers[i] = containerConfig
}
_, _ = outf.WriteString("Starting pod.\n")
pod, err := e.driver.NewPod(ctx, podConfig, outf)
if err != nil {
_, _ = outf.WriteString(fmt.Sprintf("Pod failed to start. Error: %s\n", err))
return err
}
_, _ = outf.WriteString("Pod started.\n")
if et.Spec.WorkingDir != "" {
_, _ = outf.WriteString(fmt.Sprintf("Creating working dir %q.\n", et.Spec.WorkingDir))
if err := e.mkdir(ctx, et, pod, outf, et.Spec.WorkingDir); err != nil {
_, _ = outf.WriteString(fmt.Sprintf("Failed to create working dir %q. Error: %s\n", et.Spec.WorkingDir, err))
return err
}
}
rt.pod = pod
return nil
}
func (e *Executor) executeTaskSteps(ctx context.Context, rt *runningTask, pod driver.Pod) (int, error) {
for i, step := range rt.et.Spec.Steps {
rt.Lock()
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseRunning
rt.et.Status.Steps[i].StartTime = util.TimeP(time.Now())
if err := e.sendExecutorTaskStatus(ctx, rt.et); err != nil {
log.Errorf("err: %+v", err)
}
rt.Unlock()
var err error
var exitCode int
var stepName string
switch s := step.(type) {
case *types.RunStep:
log.Debugf("run step: %s", util.Dump(s))
stepName = s.Name
exitCode, err = e.doRunStep(ctx, s, rt.et, pod, e.stepLogPath(rt.et.ID, i))
case *types.SaveToWorkspaceStep:
log.Debugf("save to workspace step: %s", util.Dump(s))
stepName = s.Name
archivePath := e.archivePath(rt.et.ID, i)
exitCode, err = e.doSaveToWorkspaceStep(ctx, s, rt.et, pod, e.stepLogPath(rt.et.ID, i), archivePath)
case *types.RestoreWorkspaceStep:
log.Debugf("restore workspace step: %s", util.Dump(s))
stepName = s.Name
exitCode, err = e.doRestoreWorkspaceStep(ctx, s, rt.et, pod, e.stepLogPath(rt.et.ID, i))
case *types.SaveCacheStep:
log.Debugf("save cache step: %s", util.Dump(s))
stepName = s.Name
archivePath := e.archivePath(rt.et.ID, i)
exitCode, err = e.doSaveCacheStep(ctx, s, rt.et, pod, e.stepLogPath(rt.et.ID, i), archivePath)
case *types.RestoreCacheStep:
log.Debugf("restore cache step: %s", util.Dump(s))
stepName = s.Name
exitCode, err = e.doRestoreCacheStep(ctx, s, rt.et, pod, e.stepLogPath(rt.et.ID, i))
default:
return i, errors.Errorf("unknown step type: %s", util.Dump(s))
}
var serr error
rt.Lock()
rt.et.Status.Steps[i].EndTime = util.TimeP(time.Now())
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseSuccess
if err != nil {
if rt.et.Spec.Stop {
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseStopped
} else {
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseFailed
}
serr = errors.Errorf("failed to execute step %s: %w", util.Dump(step), err)
} else if exitCode != 0 {
rt.et.Status.Steps[i].Phase = types.ExecutorTaskPhaseFailed
rt.et.Status.Steps[i].ExitStatus = util.IntP(exitCode)
serr = errors.Errorf("step %q failed with exitcode %d", stepName, exitCode)
} else if exitCode == 0 {
rt.et.Status.Steps[i].ExitStatus = util.IntP(exitCode)
}
if err := e.sendExecutorTaskStatus(ctx, rt.et); err != nil {
log.Errorf("err: %+v", err)
}
rt.Unlock()
if serr != nil {
return i, serr
}
}
return 0, nil
}
func (e *Executor) podsCleanerLoop(ctx context.Context) {
for {
log.Debugf("podsCleaner")
if err := e.podsCleaner(ctx); err != nil {
log.Errorf("err: %+v", err)
}
sleepCh := time.NewTimer(1 * time.Second).C
select {
case <-ctx.Done():
return
case <-sleepCh:
}
}
}
func (e *Executor) podsCleaner(ctx context.Context) error {
pods, err := e.getAllPods(ctx, true)
if err != nil {
return err
}
executors, err := e.driver.GetExecutors(ctx)
if err != nil {
return err
}
// always add ourself to executors
executors = append(executors, e.id)
for _, pod := range pods {
taskID := pod.TaskID()
// clean our owned pods
if pod.ExecutorID() == e.id {
if _, ok := e.runningTasks.get(taskID); !ok {
log.Infof("removing pod %s for not running task: %s", pod.ID(), taskID)
_ = pod.Remove(ctx)
}
}
// if no executor owns the pod we'll delete it
owned := false
for _, executorID := range executors {
if pod.ExecutorID() == executorID {
owned = true
break
}
}
if !owned {
log.Infof("removing pod %s since it's not owned by any active executor", pod.ID())
_ = pod.Remove(ctx)
}
}
return nil
}
func (e *Executor) executorStatusSenderLoop(ctx context.Context) {
for {
log.Debugf("executorStatusSenderLoop")
if err := e.sendExecutorStatus(ctx); err != nil {
log.Errorf("err: %+v", err)
}
sleepCh := time.NewTimer(2 * time.Second).C
select {
case <-ctx.Done():
return
case <-sleepCh:
}
}
}
func (e *Executor) executorTasksStatusSenderLoop(ctx context.Context) {
for {
log.Debugf("executorTasksStatusSenderLoop")
for _, rtID := range e.runningTasks.ids() {
rt, ok := e.runningTasks.get(rtID)
if !ok {
continue
}
rt.Lock()
if err := e.sendExecutorTaskStatus(ctx, rt.et); err != nil {
log.Errorf("err: %+v", err)
rt.Unlock()
continue
}
// remove running task if send was successful and it's not executing
if !rt.executing {
e.runningTasks.delete(rtID)
}
rt.Unlock()
}
sleepCh := time.NewTimer(2 * time.Second).C
select {
case <-ctx.Done():
return
case <-sleepCh:
}
}
}
func (e *Executor) tasksUpdaterLoop(ctx context.Context) {
for {
log.Debugf("tasksUpdater")
if err := e.tasksUpdater(ctx); err != nil {
log.Errorf("err: %+v", err)
}
sleepCh := time.NewTimer(2 * time.Second).C
select {
case <-ctx.Done():
return
case <-sleepCh:
}
}
}
// taskUpdater fetches the executor tasks from the scheduler and handles them
// this is useful to catch up when some tasks submissions from the scheduler to the executor
// APIs fails
func (e *Executor) tasksUpdater(ctx context.Context) error {
ets, _, err := e.runserviceClient.GetExecutorTasks(ctx, e.id)
if err != nil {
log.Warnf("err: %v", err)
return err
}
log.Debugf("ets: %v", util.Dump(ets))
for _, et := range ets {
go e.taskUpdater(ctx, et)
}
// remove runningTasks not existing in the runservice
etIDsMap := map[string]struct{}{}
for _, et := range ets {
etIDsMap[et.ID] = struct{}{}
}
for _, rtID := range e.runningTasks.ids() {
if _, ok := etIDsMap[rtID]; !ok {
e.runningTasks.delete(rtID)
}
}
return nil
}
func (e *Executor) taskUpdater(ctx context.Context, et *types.ExecutorTask) {
log.Debugf("et: %v", util.Dump(et))
if et.Spec.ExecutorID != e.id {
return
}
if et.Spec.Stop {
e.stopTask(ctx, et)
}
if et.Status.Phase == types.ExecutorTaskPhaseNotStarted {
e.executeTask(ctx, et)
}
if et.Status.Phase == types.ExecutorTaskPhaseRunning {
_, ok := e.runningTasks.get(et.ID)
if !ok {
log.Infof("marking executor task %s as failed since there's no running task", et.ID)
et.Status.Phase = types.ExecutorTaskPhaseFailed
et.Status.EndTime = util.TimeP(time.Now())
// mark in progress step as failed too
for _, s := range et.Status.Steps {
if s.Phase == types.ExecutorTaskPhaseRunning {
s.Phase = types.ExecutorTaskPhaseFailed
s.EndTime = util.TimeP(time.Now())
}
}
if err := e.sendExecutorTaskStatus(ctx, et); err != nil {
log.Errorf("err: %+v", err)
}
}
}
}
func (e *Executor) tasksDataCleanerLoop(ctx context.Context) {
for {
log.Debugf("tasksDataCleaner")
if err := e.tasksDataCleaner(ctx); err != nil {
log.Errorf("err: %+v", err)
}
sleepCh := time.NewTimer(2 * time.Second).C
select {
case <-ctx.Done():
return
case <-sleepCh:
}
}
}
func (e *Executor) tasksDataCleaner(ctx context.Context) error {
entries, err := ioutil.ReadDir(e.tasksDir())
if err != nil {
return err
}
for _, entry := range entries {
if !entry.IsDir() {
continue
}
etID := filepath.Base(entry.Name())
_, resp, err := e.runserviceClient.GetExecutorTask(ctx, e.id, etID)
if err != nil {
if resp == nil {
return err
}
if resp.StatusCode != http.StatusNotFound {
return err
}
}
if resp.StatusCode == http.StatusNotFound {
taskDir := e.taskPath(etID)
log.Infof("removing task dir %q", taskDir)
// remove task dir
if err := os.RemoveAll(taskDir); err != nil {
return err
}
}
}
return nil
}
type runningTasks struct {
tasks map[string]*runningTask
m sync.Mutex
}
type runningTask struct {
sync.Mutex
et *types.ExecutorTask
pod driver.Pod
executing bool
}
func (r *runningTasks) get(rtID string) (*runningTask, bool) {
r.m.Lock()
defer r.m.Unlock()
rt, ok := r.tasks[rtID]
return rt, ok
}
func (r *runningTasks) addIfNotExists(rtID string, rt *runningTask) bool {
r.m.Lock()
defer r.m.Unlock()
if _, ok := r.tasks[rtID]; ok {
return false
}
r.tasks[rtID] = rt
return true
}
func (r *runningTasks) delete(rtID string) {
r.m.Lock()
defer r.m.Unlock()
delete(r.tasks, rtID)
}
func (r *runningTasks) len() int {
r.m.Lock()
defer r.m.Unlock()
return len(r.tasks)
}
func (r *runningTasks) ids() []string {
ids := []string{}
r.m.Lock()
defer r.m.Unlock()
for id := range r.tasks {
ids = append(ids, id)
}
return ids
}
func (e *Executor) handleTasks(ctx context.Context, c <-chan *types.ExecutorTask) {
for et := range c {
go e.executeTask(ctx, et)
}
}
func (e *Executor) getExecutorID() (string, error) {
id, err := ioutil.ReadFile(e.executorIDPath())
if err != nil && !os.IsNotExist(err) {
return "", err
}
return string(id), nil
}
func (e *Executor) saveExecutorID(id string) error {
if err := common.WriteFileAtomic(e.executorIDPath(), []byte(id), 0660); err != nil {
return errors.Errorf("failed to write executor id file: %w", err)
}
return nil
}
type Executor struct {
c *config.Executor
runserviceClient *rsclient.Client
id string
runningTasks *runningTasks
driver driver.Driver
listenAddress string
listenURL string
dynamic bool
}
func NewExecutor(c *config.Executor) (*Executor, error) {
if c.Debug {
level.SetLevel(zapcore.DebugLevel)
}
var err error
c.ToolboxPath, err = filepath.Abs(c.ToolboxPath)
if err != nil {
return nil, errors.Errorf("cannot determine \"agola-toolbox\" absolute path: %w", err)
}
e := &Executor{
c: c,
runserviceClient: rsclient.NewClient(c.RunserviceURL),
runningTasks: &runningTasks{
tasks: make(map[string]*runningTask),
},
}
if err := os.MkdirAll(e.tasksDir(), 0770); err != nil {
return nil, err
}
id, err := e.getExecutorID()
if err != nil {
return nil, err
}
if id == "" {
id = uuid.NewV4().String()
if err := e.saveExecutorID(id); err != nil {
return nil, err
}
}
e.id = id
// TODO(sgotti) now the first available private ip will be used and the executor will bind to the wildcard address
// improve this to let the user define the bind and the advertize address
addr, err := sockaddr.GetPrivateIP()
if err != nil {
return nil, errors.Errorf("cannot discover executor listen address: %w", err)
}
if addr == "" {
return nil, errors.Errorf("cannot discover executor listen address")
}
u := url.URL{Scheme: "http"}
if c.Web.TLS {
u.Scheme = "https"
}
_, port, err := net.SplitHostPort(c.Web.ListenAddress)
if err != nil {
return nil, errors.Errorf("cannot get web listen port: %w", err)
}
u.Host = net.JoinHostPort(addr, port)
e.listenURL = u.String()
e.listenAddress = fmt.Sprintf(":%s", port)
var d driver.Driver
switch c.Driver.Type {
case config.DriverTypeDocker:
d, err = driver.NewDockerDriver(logger, e.id, e.c.ToolboxPath)
if err != nil {
return nil, errors.Errorf("failed to create docker driver: %w", err)
}
case config.DriverTypeK8s:
d, err = driver.NewK8sDriver(logger, e.id, c.ToolboxPath)
if err != nil {
return nil, errors.Errorf("failed to create kubernetes driver: %w", err)
}
e.dynamic = true
default:
return nil, errors.Errorf("unknown driver type %q", c.Driver.Type)
}
e.driver = d
return e, nil
}
func (e *Executor) Run(ctx context.Context) error {
if err := e.driver.Setup(ctx); err != nil {
return err
}
ch := make(chan *types.ExecutorTask)
schedulerHandler := NewTaskSubmissionHandler(ch)
logsHandler := NewLogsHandler(logger, e)
archivesHandler := NewArchivesHandler(e)
router := mux.NewRouter()
apirouter := router.PathPrefix("/api/v1alpha").Subrouter()
apirouter.Handle("/executor", schedulerHandler).Methods("POST")
apirouter.Handle("/executor/logs", logsHandler).Methods("GET")
apirouter.Handle("/executor/archives", archivesHandler).Methods("GET")
go e.executorStatusSenderLoop(ctx)
go e.executorTasksStatusSenderLoop(ctx)
go e.podsCleanerLoop(ctx)
go e.tasksUpdaterLoop(ctx)
go e.tasksDataCleanerLoop(ctx)
go e.handleTasks(ctx, ch)
httpServer := http.Server{
Addr: e.listenAddress,
Handler: apirouter,
}
lerrCh := make(chan error)
go func() {
lerrCh <- httpServer.ListenAndServe()
}()
select {
case <-ctx.Done():
log.Infof("runservice executor exiting")
httpServer.Close()
case err := <-lerrCh:
if err != nil {
log.Errorf("http server listen error: %v", err)
return err
}
}
return nil
}