Skip to content

Commit

Permalink
(a12) allow hot-appl reload
Browse files Browse the repository at this point in the history
This should stitch things together from HEAD~2,3. By extending the
monitoring mode and watchdog handling arcan-net can now interrupt
the chainloaded arcan process both while busy in lua-vm state and
elsewhere in the processing.

This adds a reload command to the monitoring mode, as well as unpacking
an update appl into a .new folder. When the unpack is completed,
chainloaded arcan jumps into monitoring mode, -net switches applname.new
over to just applname and proceeds to tell arcan to reload the current
app. This triggers a behaviour similar to system_collapse(self) with the
regular adopt etc. procedure.

There are more variants to handle with this - e.g. allowing the user
to block this behaviour, to handle revert should the new appl be buggy
and so on but this should be enough to test the general behaviour in a
more networked setting.
  • Loading branch information
letoram committed Oct 5, 2023
1 parent 80aef77 commit c3f0bee
Show file tree
Hide file tree
Showing 4 changed files with 152 additions and 38 deletions.
99 changes: 85 additions & 14 deletions src/a12/net/dir_cl.c
Expand Up @@ -16,6 +16,7 @@
#include <inttypes.h>
#include <stdint.h>
#include <signal.h>
#include <stdatomic.h>

#include "../a12.h"
#include "../a12_int.h"
Expand Down Expand Up @@ -78,6 +79,12 @@ static bool clean_appldir(const char* name, int basedir)
fchdir(basedir);
}

char buf[strlen(name) + sizeof(".new")];
if (atomic_load(&active_appls.n_active) > 0){
snprintf(buf, sizeof(buf), "%s.new", name);
nftw(buf, cleancb, 32, FTW_DEPTH | FTW_PHYS);
}

/* more careful would get the current pressure through rlimits and sweep until
* we know how many real slots are available and break at that, better option
* still would be to just keep this in a memfs like setup and rebuild the
Expand All @@ -93,6 +100,12 @@ static bool ensure_appldir(const char* name, int basedir)
fchdir(basedir);
}

char buf[strlen(name) + sizeof(".new")];
if (atomic_load(&active_appls.n_active) > 0){
snprintf(buf, sizeof(buf), "%s.new", name);
name = buf;
}

/* make sure we don't have a collision */
clean_appldir(name, basedir);

Expand Down Expand Up @@ -189,6 +202,7 @@ static pid_t exec_cpath(struct a12_state* S,
if (ctx->key)
setenv(ctx->key, ctx->val, 1);

setsid();
setenv("XDG_RUNTIME_DIR", "./", 1);
dup2(pstdin[0], STDIN_FILENO);
close(pstdin[0]);
Expand All @@ -213,18 +227,59 @@ static pid_t exec_cpath(struct a12_state* S,
return pid;
}

static void swap_appldir(const char* name, int basedir)
{
/* it is probably worth keeping track of the old here */
size_t sz = strlen(name) + sizeof(".new");
char buf[sz];
snprintf(buf, sz, "%s.new", name);

if (-1 != basedir){
fchdir(basedir);
}

nftw(name, cleancb, 32, FTW_DEPTH | FTW_PHYS);
rename(buf, name);
}

/*
* This will only trigger when there's data / shutdown from arcan.
* With the current monitor mode setup that is only on script errors
* or legitimate shutdowns, so safe to treat this as blocking.
* This will only trigger when there's data / shutdown from arcan. With the
* current monitor mode setup that is only on script errors or legitimate
* shutdowns, so safe to treat this as blocking.
*/
static void process_thread(struct ioloop_shared* I, bool ok)
{
char buf[4096];

/* capture the state block, write into an unlinked tmp-file so the
* file descriptor can be rewound and set as a bstream */
struct appl_runner_state* A = I->tag;
struct directory_meta* cbt = I->cbt;

/* we are trying to synch a reload, now's the time to remove the old appldir
* and rename the .new into just basename. */
if (ok){
if (fgets(buf, 4096, A->pf_stdout)){
if (strcmp(buf, "#LOCKED\n") == 0){
swap_appldir(cbt->clopt->applname, cbt->clopt->basedir);

/* the first 'continue here is to unlock the reload, i.e. we don't want to
* buffer more commands in the same atomic commit. this causes recovery into
* something that immediately switches into monitoring mode again in order
* to provide a window for re-injecting state */
fprintf(A->pf_stdin, "reload\n");
fprintf(A->pf_stdin, "continue\n");

/* state is already in database so just continue again, here is where we
* could do some other funky things, e.g. force a rollback to an externally
* defined override state. */
fprintf(A->pf_stdin, "continue\n");
}
}

return;
}

int pret = 0;
char* out = NULL;
char filename[] = "statetemp-XXXXXX";
Expand All @@ -238,8 +293,6 @@ static void process_thread(struct ioloop_shared* I, bool ok)
unlink(filename);

while (!feof(A->pf_stdout)){
char buf[4096];

/* couldn't get more state, STDOUT is likely broken - process dead or dying */
if (!fgets(buf, 4096, A->pf_stdout)){
while ((waitpid(A->pid, &pret, 0)) != A->pid
Expand Down Expand Up @@ -487,10 +540,19 @@ static void mark_xfer_complete(struct ioloop_shared* I, struct a12_bhandler_meta
cbt->appl_out = NULL;
cbt->appl_out_complete = false;

/* Setup so it is threadable, though unlikely that useful versus just
* having userfd and multiplex appl-a12 that way. It would be for the
* case of running multiple appls over the same channel so keep that
* door open. */
/* Setup so it is threadable, though unlikely that useful versus just having
* userfd and multiplex appl-a12 that way. It would be for the case of running
* multiple appls over the same channel so keep that door open. Right now we
* work on the idea that if a new appl is received we should just force the
* other end to reload. We do this by queueing the command, waking the watchdog
* and waiting for it to reply to us via userfd */
if (atomic_load(&active_appls.n_active) > 0){
fprintf(active_appls.active.pf_stdin, "lock\n");
kill(active_appls.active.pid, SIGUSR1);
fprintf(stderr, "signalling=%d\n", active_appls.active.pid);
return;
}

atomic_fetch_add(&active_appls.n_active, 1);
active_appls.active.ios = I;

Expand Down Expand Up @@ -547,16 +609,23 @@ struct a12_bhandler_res anet_directory_cl_bhandler(
return res;
}

/* This can also happen if there was a new appl announced while we were busy
* unpacking the previous one, the dirstate event triggers the bin request
* triggers new initialize. Options are to cancel the current form, ignore
* the update or defer the request for a new one. The current implementation
* is to defer and happens in dir-state. */
if (cbt->appl_out){
fprintf(stderr, "Appl transfer initiated while one was pending\n");
return res;
}

/* FIXME:
* if we are already running an appl, we need an applname.new that we
* (near) atomically switch over to when unpack is ready then force a
* reload.
*/
/* We already have an appl running, this can be two things - one we have a
* resource intended for the appl itself. While still unhandled that is easy,
* setup a new descriptor link, BCHUNK_IN/OUT it into the shmif connection
* marked streaming and we are good to go.
*
* In the case of an appl we should verify that we wanted hot reloading,
* ensure_appldir into new and set atomic-swap on completion. */
if (!ensure_appldir(cbt->clopt->applname, cbt->clopt->basedir)){
fprintf(stderr, "Couldn't create temporary appl directory\n");
return res;
Expand All @@ -566,6 +635,8 @@ struct a12_bhandler_res anet_directory_cl_bhandler(
res.fd = fileno(cbt->appl_out);
res.flag = A12_BHANDLER_NEWFD;

/* these should really just enforce basedir and never use relative, it is
* just some initial hack thing that survived. */
if (-1 != cbt->clopt->basedir)
fchdir(cbt->clopt->basedir);
else
Expand Down
82 changes: 61 additions & 21 deletions src/engine/arcan_monitor.c
Expand Up @@ -7,30 +7,20 @@ static FILE* m_out;
static FILE* m_ctrl;
static bool m_locked;
static bool m_transaction;
static int longjmp_mode;

/*
* Might just be a point to use shmif client API here and in arcan-net end use
* shmif inherited there as well. The slight annoyance is the conflict with
* LWA getting its primary connection.
* instead of adding more commands here, the saner option is to establish
* a shmif based control interface (with the interesting consequence of
* being able to migrate that channel dynamically).
*
* The other option would be to explicitly name a connection point and have two
* shmif connections, one for the monitor controls and one for the regular LWA.
* With all the other monitor- uses and lwa interactions that might be easiest
* as a command though (lwa- subsegment push is problematic as it might be a
* native arcan running.
*
* That makes more sense structurally then having this separate setup, especially
* as more entry points into scripts might be needed - e.g. state transfer.
*
* This would also mesh better with other debugging tools.
*
* fgets ->
* need command for:
* database externally modified (new namespaces, EVENT_SYSTEM)
* debug-controls
* (single stepping, add breakpoint, tracing)
* force-reset (can longjmp to recover)
* soft shutdown (enqueue EVENT_SYSTEM_EXIT)
* lua-statesnap
* mask function
* add / run hookscript
* Otherwise the option is to start with a primary connection coming from the
* monitor and NEWSEGMENT the control over that. The problem then is migration
* of the primary when that is coming from an outer UI.
*/

static void cmd_dumpkeys(char* arg)
Expand All @@ -47,6 +37,30 @@ static void cmd_dumpkeys(char* arg)
fflush(m_out);
}

static void cmd_reload(char* arg)
{
/* signal verifyload- state, wrong sig */
char* res = arcan_expand_resource("", RESOURCE_APPL);

const char* errc;
if (!arcan_verifyload_appl(res, &errc)){
arcan_mem_free(res);
fprintf(m_out, "#ERROR %s\n", *errc);
fflush(m_out);
return;
}

arcan_mem_free(res);

/* this should correspond to a system_collapse(self) with a possible copy
* of the source appl to revert back into the previously stable copy. The
* problem with these tactics is our namespace enforcement of applname/..
* so we'd need a .suffix for this to work compatibility wise. */

/* will trigger on next continue; */
longjmp_mode = ARCAN_LUA_SWITCH_APPL;
}

static void cmd_loadkey(char* arg)
{
if (!arg[0] || arg[0] == '\n')
Expand Down Expand Up @@ -88,6 +102,13 @@ static void cmd_commit(char* arg)
m_transaction = false;
}

static void cmd_lock(char* arg)
{
/* no-op, m_locked already set */
fprintf(m_out, "#LOCKED\n");
fflush(m_out);
}

static void cmd_continue(char* arg)
{
m_locked = false;
Expand Down Expand Up @@ -115,14 +136,17 @@ void arcan_monitor_watchdog(lua_State* L, lua_Debug* D)
{"dumpkeys", cmd_dumpkeys},
{"loadkey", cmd_loadkey},
{"commit", cmd_commit},
{"reload", cmd_reload},
{"lock", cmd_lock}
};

m_locked = true;

do {
char buf[4096];
if (!fgets(buf, 4096, m_ctrl)){
longjmp(arcanmain_recover_state, ARCAN_LUA_KILL_SILENT);
longjmp_mode = ARCAN_LUA_KILL_SILENT;
break;
}
/* no funny / advanced format here, just command\sarg*/
size_t i = 0;
Expand All @@ -139,6 +163,12 @@ void arcan_monitor_watchdog(lua_State* L, lua_Debug* D)
} while (m_locked);

arcan_conductor_toggle_watchdog();

if (longjmp_mode){
int mode = longjmp_mode;
longjmp_mode = 0;
longjmp(arcanmain_recover_state, mode);
}
}

bool arcan_monitor_configure(int srate, const char* dst, FILE* ctrl)
Expand Down Expand Up @@ -206,6 +236,16 @@ void arcan_monitor_tick()
{
static size_t count;

if (m_ctrl){
struct pollfd pfd = {
.fd = STDIN_FILENO,
.events = POLLIN
};
if (1 == poll(&pfd, 1, 0)){
arcan_monitor_watchdog(NULL, NULL);
}
}

if (m_srate <= 0)
return;

Expand Down
6 changes: 3 additions & 3 deletions src/platform/posix/appl.c
Expand Up @@ -79,15 +79,15 @@ bool arcan_verifyload_appl(const char* appl_id, const char** errc)
app_len = strlen(base);
for (size_t i = 0; i < app_len; i++){
if (!isalnum(base[i]) && base[i] != '_'){
*errc = "invalid character in appl_id (only a..Z _ 0..9 allowed)\n";
*errc = "invalid character in appl_id (only a..Z _ 0..9 allowed)";
return false;
}
}

if (expand){
char* dir = arcan_expand_resource(base, RESOURCE_SYS_APPLBASE);
if (!dir){
*errc = "missing application base\n";
*errc = "missing application base";
free(base);
return false;
}
Expand All @@ -96,7 +96,7 @@ bool arcan_verifyload_appl(const char* appl_id, const char** errc)

dir = arcan_expand_resource(base, RESOURCE_SYS_APPLSTORE);
if (!dir){
*errc = "missing application temporary store\n";
*errc = "missing application temporary store";
free(base);
return false;
}
Expand Down
3 changes: 3 additions & 0 deletions src/shmif/platform/exec.c
Expand Up @@ -214,6 +214,9 @@ pid_t shmif_platform_execve(int fd, const char* shmif_key,
if ((opts & 1) && (pid = fork()) != 0)
_exit(pid > 0 ? EXIT_SUCCESS : EXIT_FAILURE);


setsid();

/* GNU or BSD4.2 */
execve(path, argv, new_env);
_exit(EXIT_FAILURE);
Expand Down

0 comments on commit c3f0bee

Please sign in to comment.