From b1712fabd1702640b04b0acdbba2d78294313a4d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 12 Jan 2023 17:48:43 +0100 Subject: [PATCH 1/6] sd-journal: add high-level API for querying seqnum for journal entries, along with seqnum id --- src/libsystemd/libsystemd.sym | 1 + src/libsystemd/sd-journal/sd-journal.c | 31 ++++++++++++++++++++++++++ src/systemd/sd-journal.h | 1 + 3 files changed, 33 insertions(+) diff --git a/src/libsystemd/libsystemd.sym b/src/libsystemd/libsystemd.sym index 0f2230a42f9..802009b7100 100644 --- a/src/libsystemd/libsystemd.sym +++ b/src/libsystemd/libsystemd.sym @@ -810,4 +810,5 @@ global: sd_pidfd_get_unit; sd_pidfd_get_user_slice; sd_pidfd_get_user_unit; + sd_journal_get_seqnum; } LIBSYSTEMD_252; diff --git a/src/libsystemd/sd-journal/sd-journal.c b/src/libsystemd/sd-journal/sd-journal.c index 659e1e4e348..61249816a4d 100644 --- a/src/libsystemd/sd-journal/sd-journal.c +++ b/src/libsystemd/sd-journal/sd-journal.c @@ -2228,6 +2228,37 @@ _public_ int sd_journal_get_monotonic_usec(sd_journal *j, uint64_t *ret, sd_id12 return 0; } +_public_ int sd_journal_get_seqnum( + sd_journal *j, + uint64_t *ret_seqnum, + sd_id128_t *ret_seqnum_id) { + + JournalFile *f; + Object *o; + int r; + + assert_return(j, -EINVAL); + assert_return(!journal_pid_changed(j), -ECHILD); + + f = j->current_file; + if (!f) + return -EADDRNOTAVAIL; + + if (f->current_offset <= 0) + return -EADDRNOTAVAIL; + + r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o); + if (r < 0) + return r; + + if (ret_seqnum_id) + *ret_seqnum_id = f->header->seqnum_id; + if (ret_seqnum) + *ret_seqnum = le64toh(o->entry.seqnum); + + return 0; +} + static bool field_is_valid(const char *field) { assert(field); diff --git a/src/systemd/sd-journal.h b/src/systemd/sd-journal.h index 24638f1e0ab..24e67663b95 100644 --- a/src/systemd/sd-journal.h +++ b/src/systemd/sd-journal.h @@ -99,6 +99,7 @@ int sd_journal_next_skip(sd_journal *j, uint64_t skip); int sd_journal_get_realtime_usec(sd_journal *j, uint64_t *ret); int sd_journal_get_monotonic_usec(sd_journal *j, uint64_t *ret, sd_id128_t *ret_boot_id); +int sd_journal_get_seqnum(sd_journal *j, uint64_t *ret_seqnum, sd_id128_t *ret_seqnum_id); int sd_journal_set_data_threshold(sd_journal *j, size_t sz); int sd_journal_get_data_threshold(sd_journal *j, size_t *sz); From 2bc70e2e9db13cfbc74e4bf1fd28ecbfef1466c9 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 19 Jan 2023 22:42:03 +0100 Subject: [PATCH 2/6] logs-show: show seqnum info in export+json output mode --- src/shared/journal-importer.c | 6 +++--- src/shared/logs-show.c | 39 ++++++++++++++++++++++++++++------- test/units/testsuite-04.sh | 2 +- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/src/shared/journal-importer.c b/src/shared/journal-importer.c index d9eabec886f..83e9834bbf2 100644 --- a/src/shared/journal-importer.c +++ b/src/shared/journal-importer.c @@ -14,6 +14,7 @@ #include "journal-util.h" #include "parse-util.h" #include "string-util.h" +#include "strv.h" #include "unaligned.h" enum { @@ -217,9 +218,8 @@ static int process_special_field(JournalImporter *imp, char *line) { assert(line); - value = startswith(line, "__CURSOR="); - if (value) - /* ignore __CURSOR */ + if (STARTSWITH_SET(line, "__CURSOR=", "__SEQNUM=", "__SEQNUM_ID=")) + /* ignore __CURSOR=, __SEQNUM=, __SEQNUM_ID= which we cannot replicate */ return 1; value = startswith(line, "__REALTIME_TIMESTAMP="); diff --git a/src/shared/logs-show.c b/src/shared/logs-show.c index b72e516c8da..6ed9a6890f5 100644 --- a/src/shared/logs-show.c +++ b/src/shared/logs-show.c @@ -791,11 +791,12 @@ static int output_export( const dual_timestamp *previous_display_ts, const sd_id128_t *previous_boot_id) { + sd_id128_t journal_boot_id, seqnum_id; _cleanup_free_ char *cursor = NULL; - const void *data; - size_t length; usec_t monotonic, realtime; - sd_id128_t journal_boot_id; + const void *data; + uint64_t seqnum; + size_t length; int r; assert(j); @@ -818,14 +819,22 @@ static int output_export( if (r < 0) return log_error_errno(r, "Failed to get monotonic timestamp: %m"); + r = sd_journal_get_seqnum(j, &seqnum, &seqnum_id); + if (r < 0) + return log_error_errno(r, "Failed to get seqnum: %m"); + fprintf(f, "__CURSOR=%s\n" - "__REALTIME_TIMESTAMP="USEC_FMT"\n" - "__MONOTONIC_TIMESTAMP="USEC_FMT"\n" + "__REALTIME_TIMESTAMP=" USEC_FMT "\n" + "__MONOTONIC_TIMESTAMP=" USEC_FMT "\n" + "__SEQNUM=%" PRIu64 "\n" + "__SEQNUM_ID=%s\n" "_BOOT_ID=%s\n", cursor, realtime, monotonic, + seqnum, + SD_ID128_TO_STRING(seqnum_id), SD_ID128_TO_STRING(journal_boot_id)); JOURNAL_FOREACH_DATA_RETVAL(j, data, length, r) { @@ -1040,15 +1049,16 @@ static int output_json( const dual_timestamp *previous_display_ts, const sd_id128_t *previous_boot_id) { - char usecbuf[DECIMAL_STR_MAX(usec_t)]; + char usecbuf[CONST_MAX(DECIMAL_STR_MAX(usec_t), DECIMAL_STR_MAX(uint64_t))]; _cleanup_(json_variant_unrefp) JsonVariant *object = NULL; + sd_id128_t journal_boot_id, seqnum_id; _cleanup_free_ char *cursor = NULL; + usec_t realtime, monotonic; JsonVariant **array = NULL; struct json_data *d; Hashmap *h = NULL; + uint64_t seqnum; size_t n = 0; - usec_t realtime, monotonic; - sd_id128_t journal_boot_id; int r; assert(j); @@ -1071,6 +1081,10 @@ static int output_json( if (r < 0) return log_error_errno(r, "Failed to get monotonic timestamp: %m"); + r = sd_journal_get_seqnum(j, &seqnum, &seqnum_id); + if (r < 0) + return log_error_errno(r, "Failed to get seqnum: %m"); + h = hashmap_new(&string_hash_ops); if (!h) return log_oom(); @@ -1093,6 +1107,15 @@ static int output_json( if (r < 0) goto finish; + xsprintf(usecbuf, USEC_FMT, seqnum); + r = update_json_data(h, flags, "__SEQNUM", usecbuf, SIZE_MAX); + if (r < 0) + goto finish; + + r = update_json_data(h, flags, "__SEQNUM_ID", SD_ID128_TO_STRING(seqnum_id), SIZE_MAX); + if (r < 0) + goto finish; + for (;;) { const void *data; size_t size; diff --git a/test/units/testsuite-04.sh b/test/units/testsuite-04.sh index d10a9afbf7c..5d27df91fbe 100755 --- a/test/units/testsuite-04.sh +++ b/test/units/testsuite-04.sh @@ -70,7 +70,7 @@ ID=$(journalctl --new-id128 | sed -n 2p) printf $'foo' | systemd-cat -t "$ID" --level-prefix false journalctl --sync journalctl -b -o export --output-fields=MESSAGE,FOO --output-fields=PRIORITY,MESSAGE -t "$ID" >/output -[[ $(grep -c . /output) -eq 6 ]] +[[ $(grep -c . /output) -eq 8 ]] grep -q '^__CURSOR=' /output grep -q '^MESSAGE=foo$' /output grep -q '^PRIORITY=6$' /output From e5d60d1b3b0c0b229571823c7fdd390a9562982e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 23 Jan 2023 21:21:21 +0100 Subject: [PATCH 3/6] journald: maintain entry seqnum counter in mmap()ed file in /run/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's ensure that entry seqnums remain stable and monotonic across the entire runtime of the system, even if local storage is turned off. Let's do this by maintainer a counter file in /run/ which we mmap() and wherein we maintain the counter from early-boot on till late shutdown. This takes inspiration of the kernel-seqnum file we already maintain like that that tracks which kmsg messages we already processed. In fact, we reuse the same code for maintaining it. This should allow the behaviour entry seqnums to be more predictable, in particular when journal local storage is turned off. Previously, we'd maintain the seqnum simply by always bumping it to the maximum of the last written entry seqnum plus one, and the biggest seqnum so far written to the journal file on disk. If we'd never write a file on disk, or if no journal file was existing during the initrd→seqnum transition we'd completely lose the current seqnum position during daemon restarts (such as the one happening during the switch-root operation). This also will cause a journal file rotation whenever we try to write to a journal file with multiple sequence number IDs, so that we know that from early boot trhough the entire runtime we'll have stable sequence numbers that do not jump, and thus can be used to determine "lost" messages. --- src/journal-remote/journal-remote-write.c | 25 ++++-- src/journal/journald-kmsg.c | 26 +----- src/journal/journald-server.c | 105 +++++++++++++++++++--- src/journal/journald-server.h | 12 ++- src/journal/test-journal-flush.c | 2 +- src/journal/test-journal-interleaving.c | 2 +- src/journal/test-journal-stream.c | 6 +- src/journal/test-journal-verify.c | 2 +- src/journal/test-journal.c | 8 +- src/libsystemd/sd-journal/journal-file.c | 30 ++++++- src/libsystemd/sd-journal/journal-file.h | 5 +- 11 files changed, 168 insertions(+), 55 deletions(-) diff --git a/src/journal-remote/journal-remote-write.c b/src/journal-remote/journal-remote-write.c index f6c3d83732b..ace4a1cfadd 100644 --- a/src/journal-remote/journal-remote-write.c +++ b/src/journal-remote/journal-remote-write.c @@ -99,9 +99,16 @@ int writer_write(Writer *w, return r; } - r = journal_file_append_entry(w->journal->file, ts, boot_id, - iovw->iovec, iovw->count, - &w->seqnum, NULL, NULL); + r = journal_file_append_entry( + w->journal->file, + ts, + boot_id, + iovw->iovec, + iovw->count, + &w->seqnum, + /* seqnum_id= */ NULL, + /* ret_object= */ NULL, + /* ret_offset= */ NULL); if (r >= 0) { if (w->server) w->server->event_count += 1; @@ -120,9 +127,15 @@ int writer_write(Writer *w, return r; log_debug("Retrying write."); - r = journal_file_append_entry(w->journal->file, ts, boot_id, - iovw->iovec, iovw->count, - &w->seqnum, NULL, NULL); + r = journal_file_append_entry( + w->journal->file, + ts, + boot_id, + iovw->iovec, iovw->count, + &w->seqnum, + /* seqnum_id= */ NULL, + /* ret_object= */ NULL, + /* ret_offset= */ NULL); if (r < 0) return r; diff --git a/src/journal/journald-kmsg.c b/src/journal/journald-kmsg.c index c64e626b2cd..83082c4e94f 100644 --- a/src/journal/journald-kmsg.c +++ b/src/journal/journald-kmsg.c @@ -423,9 +423,6 @@ finish: } int server_open_kernel_seqnum(Server *s) { - _cleanup_close_ int fd = -EBADF; - const char *fn; - uint64_t *p; int r; assert(s); @@ -436,26 +433,9 @@ int server_open_kernel_seqnum(Server *s) { if (!s->dev_kmsg_readable) return 0; - fn = strjoina(s->runtime_directory, "/kernel-seqnum"); - fd = open(fn, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644); - if (fd < 0) { - log_error_errno(errno, "Failed to open %s, ignoring: %m", fn); - return 0; - } - - r = posix_fallocate_loop(fd, 0, sizeof(uint64_t)); - if (r < 0) { - log_error_errno(r, "Failed to allocate sequential number file, ignoring: %m"); - return 0; - } - - p = mmap(NULL, sizeof(uint64_t), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); - if (p == MAP_FAILED) { - log_error_errno(errno, "Failed to map sequential number file, ignoring: %m"); - return 0; - } - - s->kernel_seqnum = p; + r = server_map_seqnum_file(s, "kernel-seqnum", sizeof(uint64_t), (void**) &s->kernel_seqnum); + if (r < 0) + return log_error_errno(r, "Failed to map kernel seqnum file: %m"); return 0; } diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c index 022e12d83df..f88365e2684 100644 --- a/src/journal/journald-server.c +++ b/src/journal/journald-server.c @@ -889,10 +889,12 @@ static bool shall_try_append_again(JournalFile *f, int r) { log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: Montonic clock jumped backwards relative to last journal entry, rotating.", f->path); return true; + case -EILSEQ: /* seqnum ID last used in the file doesn't match the one we'd passed when writing an entry to it */ + log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: Journal file uses a different sequence number ID, rotating.", f->path); + return true; + case -EAFNOSUPPORT: - log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, - "%s: underlying file system does not support memory mapping or another required file system feature.", - f->path); + log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: underlying file system does not support memory mapping or another required file system feature.", f->path); return false; default: @@ -900,7 +902,13 @@ static bool shall_try_append_again(JournalFile *f, int r) { } } -static void server_write_to_journal(Server *s, uid_t uid, struct iovec *iovec, size_t n, int priority) { +static void server_write_to_journal( + Server *s, + uid_t uid, + const struct iovec *iovec, + size_t n, + int priority) { + bool vacuumed = false, rotate = false; struct dual_timestamp ts; ManagedJournalFile *f; @@ -950,7 +958,15 @@ static void server_write_to_journal(Server *s, uid_t uid, struct iovec *iovec, s s->last_realtime_clock = ts.realtime; - r = journal_file_append_entry(f->file, &ts, NULL, iovec, n, &s->seqnum, NULL, NULL); + r = journal_file_append_entry( + f->file, + &ts, + /* boot_id= */ NULL, + iovec, n, + &s->seqnum->seqnum, + &s->seqnum->id, + /* ret_object= */ NULL, + /* ret_offset= */ NULL); if (r >= 0) { server_schedule_sync(s, priority); return; @@ -978,7 +994,15 @@ static void server_write_to_journal(Server *s, uid_t uid, struct iovec *iovec, s return; log_debug_errno(r, "Retrying write."); - r = journal_file_append_entry(f->file, &ts, NULL, iovec, n, &s->seqnum, NULL, NULL); + r = journal_file_append_entry( + f->file, + &ts, + /* boot_id= */ NULL, + iovec, n, + &s->seqnum->seqnum, + &s->seqnum->id, + /* ret_object= */ NULL, + /* ret_offset= */ NULL); if (r < 0) log_ratelimit_error_errno(r, FAILED_TO_WRITE_ENTRY_RATELIMIT, "Failed to write entry to %s (%zu items, %zu bytes) despite vacuuming, ignoring: %m", @@ -1290,7 +1314,13 @@ int server_flush_to_var(Server *s, bool require_flag_file) { goto finish; } - r = journal_file_copy_entry(f, s->system_journal->file, o, f->current_offset); + r = journal_file_copy_entry( + f, + s->system_journal->file, + o, + f->current_offset, + &s->seqnum->seqnum, + &s->seqnum->id); if (r >= 0) continue; @@ -1312,7 +1342,13 @@ int server_flush_to_var(Server *s, bool require_flag_file) { } log_debug("Retrying write."); - r = journal_file_copy_entry(f, s->system_journal->file, o, f->current_offset); + r = journal_file_copy_entry( + f, + s->system_journal->file, + o, + f->current_offset, + &s->seqnum->seqnum, + &s->seqnum->id); if (r < 0) { log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "Can't write entry: %m"); goto finish; @@ -2250,6 +2286,51 @@ static int server_open_varlink(Server *s, const char *socket, int fd) { return 0; } +int server_map_seqnum_file( + Server *s, + const char *fname, + size_t size, + void **ret) { + + _cleanup_free_ char *fn = NULL; + _cleanup_close_ int fd = -EBADF; + uint64_t *p; + int r; + + assert(s); + assert(fname); + assert(size > 0); + assert(ret); + + fn = path_join(s->runtime_directory, fname); + if (!fn) + return -ENOMEM; + + fd = open(fn, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644); + if (fd < 0) + return -errno; + + r = posix_fallocate_loop(fd, 0, size); + if (r < 0) + return r; + + p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + return -errno; + + *ret = p; + return 0; +} + +void server_unmap_seqnum_file(void *p, size_t size) { + assert(size > 0); + + if (!p) + return; + + assert_se(munmap(p, size) >= 0); +} + static bool server_is_idle(Server *s) { assert(s); @@ -2562,6 +2643,10 @@ int server_init(Server *s, const char *namespace) { if (r < 0) return r; + r = server_map_seqnum_file(s, "seqnum", sizeof(SeqnumData), (void**) &s->seqnum); + if (r < 0) + return log_error_errno(r, "Failed to map main seqnum file: %m"); + r = server_open_kernel_seqnum(s); if (r < 0) return r; @@ -2678,8 +2763,8 @@ void server_done(Server *s) { if (s->ratelimit) journal_ratelimit_free(s->ratelimit); - if (s->kernel_seqnum) - munmap(s->kernel_seqnum, sizeof(uint64_t)); + server_unmap_seqnum_file(s->seqnum, sizeof(*s->seqnum)); + server_unmap_seqnum_file(s->kernel_seqnum, sizeof(*s->kernel_seqnum)); free(s->buffer); free(s->tty_path); diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h index ee8f3741908..e7bf750a59c 100644 --- a/src/journal/journald-server.h +++ b/src/journal/journald-server.h @@ -60,6 +60,13 @@ typedef struct JournalStorage { JournalStorageSpace space; } JournalStorage; +/* This structure will be kept in $RUNTIME_DIRECTORY/seqnum and is mapped by journald, and is used to + * maintain the sequence number counter with its seqnum ID */ +typedef struct SeqnumData { + sd_id128_t id; + uint64_t seqnum; +} SeqnumData; + struct Server { char *namespace; @@ -93,7 +100,7 @@ struct Server { ManagedJournalFile *system_journal; OrderedHashmap *user_journals; - uint64_t seqnum; + SeqnumData *seqnum; char *buffer; @@ -227,3 +234,6 @@ void server_space_usage_message(Server *s, JournalStorage *storage); int server_start_or_stop_idle_timer(Server *s); int server_refresh_idle_timer(Server *s); + +int server_map_seqnum_file(Server *s, const char *fname, size_t size, void **ret); +void server_unmap_seqnum_file(void *p, size_t size); diff --git a/src/journal/test-journal-flush.c b/src/journal/test-journal-flush.c index 53e479909e7..f2fe0e51c00 100644 --- a/src/journal/test-journal-flush.c +++ b/src/journal/test-journal-flush.c @@ -53,7 +53,7 @@ static void test_journal_flush(int argc, char *argv[]) { log_error_errno(r, "journal_file_move_to_object failed: %m"); assert_se(r >= 0); - r = journal_file_copy_entry(f, new_journal->file, o, f->current_offset); + r = journal_file_copy_entry(f, new_journal->file, o, f->current_offset, NULL, NULL); if (r < 0) log_warning_errno(r, "journal_file_copy_entry failed: %m"); assert_se(r >= 0 || diff --git a/src/journal/test-journal-interleaving.c b/src/journal/test-journal-interleaving.c index 67552de8fe8..55d717da316 100644 --- a/src/journal/test-journal-interleaving.c +++ b/src/journal/test-journal-interleaving.c @@ -66,7 +66,7 @@ static void append_number(ManagedJournalFile *f, int n, uint64_t *seqnum) { assert_se(asprintf(&p, "NUMBER=%d", n) >= 0); iovec[0] = IOVEC_MAKE_STRING(p); - assert_ret(journal_file_append_entry(f->file, &ts, NULL, iovec, 1, seqnum, NULL, NULL)); + assert_ret(journal_file_append_entry(f->file, &ts, NULL, iovec, 1, seqnum, NULL, NULL, NULL)); free(p); } diff --git a/src/journal/test-journal-stream.c b/src/journal/test-journal-stream.c index 9f4494c6c0a..940625a0843 100644 --- a/src/journal/test-journal-stream.c +++ b/src/journal/test-journal-stream.c @@ -103,12 +103,12 @@ static void run_test(void) { iovec[1] = IOVEC_MAKE(q, strlen(q)); if (i % 10 == 0) - assert_se(journal_file_append_entry(three->file, &ts, NULL, iovec, 2, NULL, NULL, NULL) == 0); + assert_se(journal_file_append_entry(three->file, &ts, NULL, iovec, 2, NULL, NULL, NULL, NULL) == 0); else { if (i % 3 == 0) - assert_se(journal_file_append_entry(two->file, &ts, NULL, iovec, 2, NULL, NULL, NULL) == 0); + assert_se(journal_file_append_entry(two->file, &ts, NULL, iovec, 2, NULL, NULL, NULL, NULL) == 0); - assert_se(journal_file_append_entry(one->file, &ts, NULL, iovec, 2, NULL, NULL, NULL) == 0); + assert_se(journal_file_append_entry(one->file, &ts, NULL, iovec, 2, NULL, NULL, NULL, NULL) == 0); } free(p); diff --git a/src/journal/test-journal-verify.c b/src/journal/test-journal-verify.c index 0c58d05ced3..7a90079fc8e 100644 --- a/src/journal/test-journal-verify.c +++ b/src/journal/test-journal-verify.c @@ -94,7 +94,7 @@ static int run_test(int argc, char *argv[]) { iovec = IOVEC_MAKE_STRING(test); - assert_se(journal_file_append_entry(df->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0); + assert_se(journal_file_append_entry(df->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0); free(test); } diff --git a/src/journal/test-journal.c b/src/journal/test-journal.c index 889673cae77..cd295b2a7e5 100644 --- a/src/journal/test-journal.c +++ b/src/journal/test-journal.c @@ -45,13 +45,13 @@ static void test_non_empty_one(void) { assert_se(sd_id128_randomize(&fake_boot_id) == 0); iovec = IOVEC_MAKE_STRING(test); - assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0); + assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0); iovec = IOVEC_MAKE_STRING(test2); - assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0); + assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0); iovec = IOVEC_MAKE_STRING(test); - assert_se(journal_file_append_entry(f->file, &ts, &fake_boot_id, &iovec, 1, NULL, NULL, NULL) == 0); + assert_se(journal_file_append_entry(f->file, &ts, &fake_boot_id, &iovec, 1, NULL, NULL, NULL, NULL) == 0); #if HAVE_GCRYPT journal_file_append_tag(f->file); @@ -199,7 +199,7 @@ static bool check_compressed(uint64_t compress_threshold, uint64_t data_size) { dual_timestamp_get(&ts); iovec = IOVEC_MAKE(data, data_size); - assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0); + assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0); #if HAVE_GCRYPT journal_file_append_tag(f->file); diff --git a/src/libsystemd/sd-journal/journal-file.c b/src/libsystemd/sd-journal/journal-file.c index 2ead29548b3..aab33dbfcca 100644 --- a/src/libsystemd/sd-journal/journal-file.c +++ b/src/libsystemd/sd-journal/journal-file.c @@ -2094,6 +2094,7 @@ static int journal_file_append_entry_internal( const EntryItem items[], size_t n_items, uint64_t *seqnum, + sd_id128_t *seqnum_id, Object **ret_object, uint64_t *ret_offset) { @@ -2139,6 +2140,21 @@ static int journal_file_append_entry_internal( } } + if (seqnum_id) { + /* Settle the passed in sequence number ID */ + + if (sd_id128_is_null(*seqnum_id)) + *seqnum_id = f->header->seqnum_id; /* Caller has none assigned, then copy the one from the file */ + else if (!sd_id128_equal(*seqnum_id, f->header->seqnum_id)) { + /* Different seqnum IDs? We can't allow entries from multiple IDs end up in the same journal.*/ + if (le64toh(f->header->n_entries) == 0) + f->header->seqnum_id = *seqnum_id; /* Caller has one, and file so far has no entries, then copy the one from the caller */ + else + return log_debug_errno(SYNTHETIC_ERRNO(EILSEQ), + "Sequence number IDs don't match, refusing entry."); + } + } + osize = offsetof(Object, entry.items) + (n_items * journal_file_entry_item_size(f)); r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np); @@ -2290,6 +2306,7 @@ int journal_file_append_entry( const struct iovec iovec[], size_t n_iovec, uint64_t *seqnum, + sd_id128_t *seqnum_id, Object **ret_object, uint64_t *ret_offset) { @@ -2376,7 +2393,7 @@ int journal_file_append_entry( typesafe_qsort(items, n_iovec, entry_item_cmp); n_iovec = remove_duplicate_entry_items(items, n_iovec); - r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret_object, ret_offset); + r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, seqnum_id, ret_object, ret_offset); /* If the memory mapping triggered a SIGBUS then we return an * IO error and ignore the error code passed down to us, since @@ -4093,7 +4110,14 @@ int journal_file_dispose(int dir_fd, const char *fname) { return 0; } -int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) { +int journal_file_copy_entry( + JournalFile *from, + JournalFile *to, + Object *o, + uint64_t p, + uint64_t *seqnum, + sd_id128_t *seqnum_id) { + _cleanup_free_ EntryItem *items_alloc = NULL; EntryItem *items; uint64_t q, n, xor_hash = 0; @@ -4168,7 +4192,7 @@ int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint6 return r; } - r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n, NULL, NULL, NULL); + r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n, seqnum, seqnum_id, NULL, NULL); if (mmap_cache_fd_got_sigbus(to->cache_fd)) return -EIO; diff --git a/src/libsystemd/sd-journal/journal-file.h b/src/libsystemd/sd-journal/journal-file.h index 8c809ed4b99..07f1f5d1806 100644 --- a/src/libsystemd/sd-journal/journal-file.h +++ b/src/libsystemd/sd-journal/journal-file.h @@ -259,7 +259,8 @@ int journal_file_append_entry( const sd_id128_t *boot_id, const struct iovec iovec[], size_t n_iovec, - uint64_t *seqno, + uint64_t *seqnum, + sd_id128_t *seqnum_id, Object **ret_object, uint64_t *ret_offset); @@ -286,7 +287,7 @@ int journal_file_move_to_entry_by_seqnum_for_data(JournalFile *f, Object *d, uin int journal_file_move_to_entry_by_realtime_for_data(JournalFile *f, Object *d, uint64_t realtime, direction_t direction, Object **ret_object, uint64_t *ret_offset); int journal_file_move_to_entry_by_monotonic_for_data(JournalFile *f, Object *d, sd_id128_t boot_id, uint64_t monotonic, direction_t direction, Object **ret_object, uint64_t *ret_offset); -int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p); +int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, sd_id128_t *seqnum_id); void journal_file_dump(JournalFile *f); void journal_file_print_header(JournalFile *f); From 37a40afcfe2b446c84d21e3b769b0d40993b0b81 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 26 Jan 2023 11:04:44 +0100 Subject: [PATCH 4/6] man: document the new sd_journal_get_seqnum() call --- man/rules/meson.build | 1 + man/sd_journal_get_realtime_usec.xml | 1 + man/sd_journal_get_seqnum.xml | 105 +++++++++++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 man/sd_journal_get_seqnum.xml diff --git a/man/rules/meson.build b/man/rules/meson.build index c5b457bd7f9..dcae4442eaa 100644 --- a/man/rules/meson.build +++ b/man/rules/meson.build @@ -720,6 +720,7 @@ manpages = [ 'sd_journal_wait'], ''], ['sd_journal_get_realtime_usec', '3', ['sd_journal_get_monotonic_usec'], ''], + ['sd_journal_get_seqnum', '3', [], ''], ['sd_journal_get_usage', '3', [], ''], ['sd_journal_has_runtime_files', '3', ['sd_journal_has_persistent_files'], ''], ['sd_journal_next', diff --git a/man/sd_journal_get_realtime_usec.xml b/man/sd_journal_get_realtime_usec.xml index 025b6a1a5aa..f5096f8db8f 100644 --- a/man/sd_journal_get_realtime_usec.xml +++ b/man/sd_journal_get_realtime_usec.xml @@ -103,6 +103,7 @@ sd_journal_open3, sd_journal_next3, sd_journal_get_data3, + sd_journal_get_seqnum3, sd_id128_get_boot3, clock_gettime2, sd_journal_get_cutoff_realtime_usec3 diff --git a/man/sd_journal_get_seqnum.xml b/man/sd_journal_get_seqnum.xml new file mode 100644 index 00000000000..732f6607633 --- /dev/null +++ b/man/sd_journal_get_seqnum.xml @@ -0,0 +1,105 @@ + + + + + + + + sd_journal_get_seqnum + systemd + + + + sd_journal_get_seqnum + 3 + + + + sd_journal_get_seqnum + Read sequence number from the current journal entry + + + + + #include <systemd/sd-journal.h> + + + int sd_journal_get_seqnum + sd_journal *j + uint64_t *ret_seqnum + sd_id128_t *ret_seqnum_id + + + + + + Description + + sd_journal_get_seqnum() returns the sequence number of the current journal + entry. It takes three arguments: the journal context object, a pointer to a 64-bit unsigned integer to + store the sequence number in, and a buffer to return the 128bit sequence number ID in. + + When writing journal entries to disk each systemd-journald instance will number + them sequentially, starting from 1 for the first entry written after subsystem intialization. Each such + series of sequence numbers is associated with a 128bit sequence number ID which is initialized randomly, + once at systemd-journal initialization. Thus, while multiple instances of + systemd-journald will assign the same sequence numbers to their written journal + entries, they will have a distinct sequence number IDs. The sequence number is assigned at the moment of + writing the entry to disk. If log entries are rewritten (for example because the volatile logs from + /run/log/ are flushed to /var/log/ via + systemd-journald-flush.service) they will get new sequence numbers assigned. + + Sequence numbers may be used to order entries (entries associated with the same sequence number ID + and lower sequence numbers should be ordered chronologically before those with higher sequence numbers), + and to detect lost entries. Note that journal service instances typically write to multiple journal files + in parallel (for example because SplitMode= is used), in which case each journal file + will only contain a subset of the sequence numbers. To recover the full stream of journal entries the + files must be combined ("interleaved"), a process that primarily relies on the sequence numbers. When + journal files are rotated (due to size or time limits), the series of sequence numbers is continued in + the replacement files. All journal files generated from the same journal instance will carry the same + sequence number ID. + + As the sequence numbers are assigned at the moment of writing the journal entries to disk they do + not exist if storage is disabled via SplitMode=. + + The ret_seqnum and ret_seqnum_id parameters may be specified + as NULL in which case the relevant data is not returned (but the call will otherwise + succeed). + + Note that these functions will not work before + sd_journal_next3 + (or related call) has been called at least + once, in order to position the read pointer at a valid entry. + + + + Return Value + + sd_journal_get_seqnum() returns 0 on success or a negative errno-style error + code.. + + + + Notes + + + + + + + + See Also + + + systemd1, + sd-journal3, + sd_journal_open3, + sd_journal_next3, + sd_journal_get_data3, + sd_journal_get_monotonic_usec3 + + + + From 11181f8a5a1e26167b9f09f3f0ed398750cc6bd7 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 26 Jan 2023 11:16:51 +0100 Subject: [PATCH 5/6] man: document __SEQNUM=/__SEQNUM_ID= journal pseudo fields --- docs/JOURNAL_EXPORT_FORMATS.md | 4 ++-- man/systemd.journal-fields.xml | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/JOURNAL_EXPORT_FORMATS.md b/docs/JOURNAL_EXPORT_FORMATS.md index 037db94f9de..9db2726fc26 100644 --- a/docs/JOURNAL_EXPORT_FORMATS.md +++ b/docs/JOURNAL_EXPORT_FORMATS.md @@ -20,7 +20,7 @@ When exporting journal data for other uses or transferring it via the network/lo * Two journal entries that follow each other are separated by a double newline. * Journal fields consisting only of valid non-control UTF-8 codepoints are serialized as they are (i.e. the field name, followed by '=', followed by field data), followed by a newline as separator to the next field. Note that fields containing newlines cannot be formatted like this. Non-control UTF-8 codepoints are the codepoints with value at or above 32 (' '), or equal to 9 (TAB). * Other journal fields are serialized in a special binary safe way: field name, followed by newline, followed by a binary 64bit little endian size value, followed by the binary field data, followed by a newline as separator to the next field. -* Entry metadata that is not actually a field is serialized like it was a field, but beginning with two underscores. More specifically, `__CURSOR=`, `__REALTIME_TIMESTAMP=`, `__MONOTONIC_TIMESTAMP=` are introduced this way. Note that these meta-fields are only generated when actual journal files are serialized. They are omitted for entries that do not originate from a journal file (for example because they are transferred for the first time to be stored in one). Or in other words: if you are generating this format you shouldn't care about these special double-underscore fields. But you might find them usable when you deserialize the format generated by us. Additional fields prefixed with two underscores might be added later on, your parser should skip over the fields it does not know. +* Entry metadata that is not actually a field is serialized like it was a field, but beginning with two underscores. More specifically, `__CURSOR=`, `__REALTIME_TIMESTAMP=`, `__MONOTONIC_TIMESTAMP=`, `__SEQNUM=`, `__SEQNUM_ID` are introduced this way. Note that these meta-fields are only generated when actual journal files are serialized. They are omitted for entries that do not originate from a journal file (for example because they are transferred for the first time to be stored in one). Or in other words: if you are generating this format you shouldn't care about these special double-underscore fields. But you might find them usable when you deserialize the format generated by us. Additional fields prefixed with two underscores might be added later on, your parser should skip over the fields it does not know. * The order in which fields appear in an entry is undefined and might be different for each entry that is serialized. And that's already it. @@ -133,7 +133,7 @@ In most cases the Journal JSON serialization is the obvious mapping of the entry * A field that contains non-printable or non-UTF8 is serialized as a number array instead. This is necessary to handle binary data in a safe way without losing data, since JSON cannot embed binary data natively. Each byte of the binary field will be mapped to its numeric value in the range 0…255. * The JSON serializer can optionally skip huge (as in larger than a specific threshold) data fields from the JSON object. If that is enabled and a data field is too large, the field name is still included in the JSON object but assigned _null_. * Within the same entry, Journal fields may have multiple values assigned. This is not allowed in JSON. The serializer will hence create a single JSON field only for these cases, and assign it an array of values (which the can be strings, _null_ or number arrays, see above). -* If the JSON data originates from a journal file it may include the special addressing fields `__CURSOR`, `__REALTIME_TIMESTAMP`, `__MONOTONIC_TIMESTAMP`, which contain the cursor string of this entry as string, and the realtime/monotonic timestamps of this entry as formatted numeric string of usec since the respective epoch. +* If the JSON data originates from a journal file it may include the special addressing fields `__CURSOR`, `__REALTIME_TIMESTAMP`, `__MONOTONIC_TIMESTAMP`, `__SEQNUM`, `__SEQNUM_ID`, which contain the cursor string of this entry as string, the realtime/monotonic timestamps of this entry as formatted numeric string of usec since the respective epoch, and the sequence number and associated sequence number ID, both formatted as strings. Here's an example, illustrating all cases mentioned above. Consider this entry: diff --git a/man/systemd.journal-fields.xml b/man/systemd.journal-fields.xml index d9e3d5760ce..9eaa4ce1fc6 100644 --- a/man/systemd.journal-fields.xml +++ b/man/systemd.journal-fields.xml @@ -596,6 +596,16 @@ + + + __SEQNUM= + __SEQNUM_ID= + + The sequence number (and associated sequence number ID) of this journal entry in the + journal file it originates from. See + sd_journal_get_seqnum3 + for details. + From f28ed2c1be77c2aab7ebf488ac095cd796294172 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 31 Jan 2023 14:40:03 +0100 Subject: [PATCH 6/6] test: add basic seqnum test --- test/units/testsuite-04.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/units/testsuite-04.sh b/test/units/testsuite-04.sh index 5d27df91fbe..419d00bedfd 100755 --- a/test/units/testsuite-04.sh +++ b/test/units/testsuite-04.sh @@ -259,4 +259,13 @@ if is_xattr_supported; then rm -rf /etc/systemd/system/logs-filtering.service.d fi +# Check that the seqnum field at least superficially works +systemd-cat echo "ya" +journalctl --sync +SEQNUM1=$(journalctl -o export -n 1 | grep -a __SEQNUM= | cut -d= -f2) +systemd-cat echo "yo" +journalctl --sync +SEQNUM2=$(journalctl -o export -n 1 | grep -a __SEQNUM= | cut -d= -f2) +test "$SEQNUM2" -gt "$SEQNUM1" + touch /testok