From 16e9f408fa9a9626059bdd6c89dc175e06b9e976 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 14 Aug 2012 22:02:24 +0200 Subject: [PATCH] journal: implement generic sharable mmap caching logic instead of having one simple per-file cache, implement a more comprehensive one that works for multiple files and can actually maintain multiple maps per file and per object type. --- Makefile.am | 21 +- src/journal/journal-file.c | 171 +++------ src/journal/journal-file.h | 24 +- src/journal/journal-internal.h | 2 +- src/journal/journald.c | 15 +- src/journal/journald.h | 2 + src/journal/mmap-cache.c | 577 ++++++++++++++++++++++++++++++ src/journal/mmap-cache.h | 34 ++ src/journal/sd-journal.c | 16 +- src/journal/test-journal-stream.c | 6 +- src/journal/test-journal.c | 2 +- 11 files changed, 716 insertions(+), 154 deletions(-) create mode 100644 src/journal/mmap-cache.c create mode 100644 src/journal/mmap-cache.h diff --git a/Makefile.am b/Makefile.am index 837bc6c5b28..c90867c8da3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2308,17 +2308,9 @@ EXTRA_DIST += \ systemd_journald_SOURCES = \ src/journal/journald.c \ src/journal/journald.h \ - src/journal/sd-journal.c \ - src/journal/journal-file.c \ - src/journal/journal-file.h \ - src/journal/lookup3.c \ - src/journal/lookup3.h \ src/journal/journal-rate-limit.c \ src/journal/journal-rate-limit.h \ - src/journal/sparse-endian.h \ - src/journal/journal-def.h \ - src/journal/journal-internal.h \ - src/journal/compress.h + src/journal/journal-internal.h nodist_systemd_journald_SOURCES = \ src/journal/journald-gperf.c @@ -2394,9 +2386,18 @@ test_journal_stream_LDADD = \ libsystemd_journal_la_SOURCES = \ src/journal/sd-journal.c \ + src/systemd/sd-journal.h \ src/journal/journal-file.c \ + src/journal/journal-file.h \ src/journal/lookup3.c \ - src/journal/journal-send.c + src/journal/lookup3.h \ + src/journal/journal-send.c \ + src/journal/journal-send.h \ + src/journal/sparse-endian.h \ + src/journal/journal-def.h 
\ + src/journal/compress.h \ + src/journal/mmap-cache.c \ + src/journal/mmap-cache.h libsystemd_journal_la_CFLAGS = \ $(AM_CFLAGS) \ diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c index 9235e5fea2d..1f5e04d791c 100644 --- a/src/journal/journal-file.c +++ b/src/journal/journal-file.c @@ -36,8 +36,6 @@ #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem)) #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem)) -#define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL) - #define COMPRESSION_SIZE_THRESHOLD (512ULL) /* This is the minimum journal file size */ @@ -71,8 +69,6 @@ static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime); static int journal_file_hmac_put_object(JournalFile *f, int type, uint64_t p); void journal_file_close(JournalFile *f) { - int t; - assert(f); /* Write the final tag */ @@ -80,9 +76,8 @@ void journal_file_close(JournalFile *f) { journal_file_append_tag(f); /* Sync everything to disk, before we mark the file offline */ - for (t = 0; t < _WINDOW_MAX; t++) - if (f->windows[t].ptr) - munmap(f->windows[t].ptr, f->windows[t].size); + if (f->mmap && f->fd >= 0) + mmap_cache_close_fd(f->mmap, f->fd); if (f->writable && f->fd >= 0) fdatasync(f->fd); @@ -100,6 +95,9 @@ void journal_file_close(JournalFile *f) { free(f->path); + if (f->mmap) + mmap_cache_unref(f->mmap); + #ifdef HAVE_XZ free(f->compress_buffer); #endif @@ -305,59 +303,11 @@ static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) return 0; } -static int journal_file_map( - JournalFile *f, - uint64_t offset, - uint64_t size, - void **_window, - uint64_t *_woffset, - uint64_t *_wsize, - void **ret) { - - uint64_t woffset, wsize; - void *window; - +static int journal_file_move_to(JournalFile *f, int context, uint64_t offset, uint64_t size, void **ret) { assert(f); - assert(size > 0); assert(ret); - woffset = offset & ~((uint64_t) page_size() - 1ULL); - wsize = size + (offset - woffset); - wsize = 
PAGE_ALIGN(wsize); - /* Avoid SIGBUS on invalid accesses */ - if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size)) - return -EADDRNOTAVAIL; - - window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset); - if (window == MAP_FAILED) - return -errno; - - if (_window) - *_window = window; - - if (_woffset) - *_woffset = woffset; - - if (_wsize) - *_wsize = wsize; - - *ret = (uint8_t*) window + (offset - woffset); - - return 0; -} - -static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) { - void *p = NULL; - uint64_t delta; - int r; - Window *w; - - assert(f); - assert(ret); - assert(wt >= 0); - assert(wt < _WINDOW_MAX); - if (offset + size > (uint64_t) f->last_stat.st_size) { /* Hmm, out of range? Let's refresh the fstat() data * first, before we trust that check. */ @@ -367,57 +317,7 @@ static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_ return -EADDRNOTAVAIL; } - w = f->windows + wt; - - if (_likely_(w->ptr && - w->offset <= offset && - w->offset + w->size >= offset + size)) { - - *ret = (uint8_t*) w->ptr + (offset - w->offset); - return 0; - } - - if (w->ptr) { - if (munmap(w->ptr, w->size) < 0) - return -errno; - - w->ptr = NULL; - w->size = w->offset = 0; - } - - if (size < DEFAULT_WINDOW_SIZE) { - /* If the default window size is larger then what was - * asked for extend the mapping a bit in the hope to - * minimize needed remappings later on. 
We add half - * the window space before and half behind the - * requested mapping */ - - delta = (DEFAULT_WINDOW_SIZE - size) / 2; - - if (delta > offset) - delta = offset; - - offset -= delta; - size = DEFAULT_WINDOW_SIZE; - } else - delta = 0; - - if (offset + size > (uint64_t) f->last_stat.st_size) - size = (uint64_t) f->last_stat.st_size - offset; - - if (size <= 0) - return -EADDRNOTAVAIL; - - r = journal_file_map(f, - offset, size, - &w->ptr, &w->offset, &w->size, - &p); - - if (r < 0) - return r; - - *ret = (uint8_t*) p + delta; - return 0; + return mmap_cache_get(f->mmap, f->fd, f->prot, context, offset, size, ret); } static bool verify_hash(Object *o) { @@ -437,17 +337,38 @@ static bool verify_hash(Object *o) { return h1 == h2; } +static uint64_t minimum_header_size(Object *o) { + + static uint64_t table[] = { + [OBJECT_DATA] = sizeof(DataObject), + [OBJECT_FIELD] = sizeof(FieldObject), + [OBJECT_ENTRY] = sizeof(EntryObject), + [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject), + [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject), + [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject), + [OBJECT_TAG] = sizeof(TagObject), + }; + + if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0) + return sizeof(ObjectHeader); + + return table[o->object.type]; +} + int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) { int r; void *t; Object *o; uint64_t s; + unsigned context; assert(f); assert(ret); - assert(type < _OBJECT_TYPE_MAX); - r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t); + /* One context for each type, plus one catch-all for the rest */ + context = type > 0 && type < _OBJECT_TYPE_MAX ? 
type : 0; + + r = journal_file_move_to(f, context, offset, sizeof(ObjectHeader), &t); if (r < 0) return r; @@ -457,6 +378,12 @@ int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Objec if (s < sizeof(ObjectHeader)) return -EBADMSG; + if (o->object.type <= OBJECT_UNUSED) + return -EBADMSG; + + if (s < minimum_header_size(o)) + return -EBADMSG; + if (type >= 0 && o->object.type != type) return -EBADMSG; @@ -508,6 +435,7 @@ static int journal_file_append_object(JournalFile *f, int type, uint64_t size, O void *t; assert(f); + assert(type > 0 && type < _OBJECT_TYPE_MAX); assert(size >= sizeof(ObjectHeader)); assert(offset); assert(ret); @@ -613,7 +541,7 @@ static int journal_file_map_data_hash_table(JournalFile *f) { s = le64toh(f->header->data_hash_table_size); r = journal_file_move_to(f, - WINDOW_DATA_HASH_TABLE, + OBJECT_DATA_HASH_TABLE, p, s, &t); if (r < 0) @@ -634,7 +562,7 @@ static int journal_file_map_field_hash_table(JournalFile *f) { s = le64toh(f->header->field_hash_table_size); r = journal_file_move_to(f, - WINDOW_FIELD_HASH_TABLE, + OBJECT_FIELD_HASH_TABLE, p, s, &t); if (r < 0) @@ -2428,6 +2356,7 @@ int journal_file_open( bool compress, bool authenticate, JournalMetrics *metrics, + MMapCache *mmap_cache, JournalFile *template, JournalFile **ret) { @@ -2457,6 +2386,19 @@ int journal_file_open( f->compress = compress; f->authenticate = authenticate; + if (mmap_cache) + f->mmap = mmap_cache_ref(mmap_cache); + else { + /* One context for each type, plus the zeroth catchall + * context. 
One fd for the file plus one for each type + * (which we need during verification) */ + f->mmap = mmap_cache_new(_OBJECT_TYPE_MAX, 1 + _OBJECT_TYPE_MAX); + if (!f->mmap) { + r = -ENOMEM; + goto fail; + } + } + f->path = strdup(fname); if (!f->path) { r = -ENOMEM; @@ -2605,7 +2547,7 @@ int journal_file_rotate(JournalFile **f, bool compress, bool authenticate) { old_file->header->state = STATE_ARCHIVED; - r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, authenticate, NULL, old_file, &new_file); + r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, authenticate, NULL, old_file->mmap, old_file, &new_file); journal_file_close(old_file); *f = new_file; @@ -2619,6 +2561,7 @@ int journal_file_open_reliably( bool compress, bool authenticate, JournalMetrics *metrics, + MMapCache *mmap, JournalFile *template, JournalFile **ret) { @@ -2626,7 +2569,7 @@ int journal_file_open_reliably( size_t l; char *p; - r = journal_file_open(fname, flags, mode, compress, authenticate, metrics, template, ret); + r = journal_file_open(fname, flags, mode, compress, authenticate, metrics, mmap, template, ret); if (r != -EBADMSG && /* corrupted */ r != -ENODATA && /* truncated */ r != -EHOSTDOWN && /* other machine */ @@ -2660,7 +2603,7 @@ int journal_file_open_reliably( log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname); - return journal_file_open(fname, flags, mode, compress, authenticate, metrics, template, ret); + return journal_file_open(fname, flags, mode, compress, authenticate, metrics, mmap, template, ret); } struct vacuum_info { diff --git a/src/journal/journal-file.h b/src/journal/journal-file.h index a16c8ffc3e6..11a1c7d6410 100644 --- a/src/journal/journal-file.h +++ b/src/journal/journal-file.h @@ -32,24 +32,7 @@ #include "sparse-endian.h" #include "journal-def.h" #include "util.h" - -typedef struct Window { - void *ptr; - uint64_t offset; - uint64_t size; -} Window; - -enum { - 
WINDOW_UNKNOWN = OBJECT_UNUSED, - WINDOW_DATA = OBJECT_DATA, - WINDOW_ENTRY = OBJECT_ENTRY, - WINDOW_DATA_HASH_TABLE = OBJECT_DATA_HASH_TABLE, - WINDOW_FIELD_HASH_TABLE = OBJECT_FIELD_HASH_TABLE, - WINDOW_ENTRY_ARRAY = OBJECT_ENTRY_ARRAY, - WINDOW_TAG = OBJECT_TAG, - WINDOW_HEADER, - _WINDOW_MAX -}; +#include "mmap-cache.h" typedef struct JournalMetrics { uint64_t max_use; @@ -76,11 +59,10 @@ typedef struct JournalFile { HashItem *data_hash_table; HashItem *field_hash_table; - Window windows[_WINDOW_MAX]; - uint64_t current_offset; JournalMetrics metrics; + MMapCache *mmap; #ifdef HAVE_XZ void *compress_buffer; @@ -108,6 +90,7 @@ int journal_file_open( bool compress, bool authenticate, JournalMetrics *metrics, + MMapCache *mmap, JournalFile *template, JournalFile **ret); @@ -120,6 +103,7 @@ int journal_file_open_reliably( bool compress, bool authenticate, JournalMetrics *metrics, + MMapCache *mmap, JournalFile *template, JournalFile **ret); diff --git a/src/journal/journal-internal.h b/src/journal/journal-internal.h index d17fdb2e92a..86519be3d41 100644 --- a/src/journal/journal-internal.h +++ b/src/journal/journal-internal.h @@ -92,6 +92,7 @@ struct sd_journal { char *path; Hashmap *files; + MMapCache *mmap; Location current_location; @@ -110,4 +111,3 @@ struct sd_journal { char *journal_make_match_string(sd_journal *j); void journal_print_header(sd_journal *j); - diff --git a/src/journal/journald.c b/src/journal/journald.c index 8c41d9bab1e..145663bf5b8 100644 --- a/src/journal/journald.c +++ b/src/journal/journald.c @@ -315,7 +315,7 @@ static JournalFile* find_journal(Server *s, uid_t uid) { journal_file_close(f); } - r = journal_file_open_reliably(p, O_RDWR|O_CREAT, 0640, s->compress, false, &s->system_metrics, s->system_journal, &f); + r = journal_file_open_reliably(p, O_RDWR|O_CREAT, 0640, s->compress, false, &s->system_metrics, s->mmap, s->system_journal, &f); free(p); if (r < 0) @@ -2006,7 +2006,7 @@ static int system_journal_open(Server *s) { if (!fn) 
return -ENOMEM; - r = journal_file_open_reliably(fn, O_RDWR|O_CREAT, 0640, s->compress, true, &s->system_metrics, NULL, &s->system_journal); + r = journal_file_open_reliably(fn, O_RDWR|O_CREAT, 0640, s->compress, true, &s->system_metrics, s->mmap, NULL, &s->system_journal); free(fn); if (r >= 0) @@ -2033,7 +2033,7 @@ static int system_journal_open(Server *s) { * if it already exists, so that we can flush * it into the system journal */ - r = journal_file_open(fn, O_RDWR, 0640, s->compress, false, &s->runtime_metrics, NULL, &s->runtime_journal); + r = journal_file_open(fn, O_RDWR, 0640, s->compress, false, &s->runtime_metrics, s->mmap, NULL, &s->runtime_journal); free(fn); if (r < 0) { @@ -2049,7 +2049,7 @@ static int system_journal_open(Server *s) { * it if necessary. */ (void) mkdir_parents(fn, 0755); - r = journal_file_open_reliably(fn, O_RDWR|O_CREAT, 0640, s->compress, false, &s->runtime_metrics, NULL, &s->runtime_journal); + r = journal_file_open_reliably(fn, O_RDWR|O_CREAT, 0640, s->compress, false, &s->runtime_metrics, s->mmap, NULL, &s->runtime_journal); free(fn); if (r < 0) { @@ -2793,6 +2793,10 @@ static int server_init(Server *s) { if (!s->user_journals) return log_oom(); + s->mmap = mmap_cache_new(_OBJECT_TYPE_MAX, USER_JOURNALS_MAX + 2); + if (!s->mmap) + return log_oom(); + s->epoll_fd = epoll_create1(EPOLL_CLOEXEC); if (s->epoll_fd < 0) { log_error("Failed to create epoll object: %m"); @@ -2919,6 +2923,9 @@ static void server_done(Server *s) { free(s->buffer); free(s->tty_path); + + if (s->mmap) + mmap_cache_unref(s->mmap); } int main(int argc, char *argv[]) { diff --git a/src/journal/journald.h b/src/journal/journald.h index d08a194780a..0202893d86a 100644 --- a/src/journal/journald.h +++ b/src/journal/journald.h @@ -93,6 +93,8 @@ typedef struct Server { Storage storage; + MMapCache *mmap; + bool dev_kmsg_readable; uint64_t *kernel_seqnum; diff --git a/src/journal/mmap-cache.c b/src/journal/mmap-cache.c new file mode 100644 index 
00000000000..68dbe7015bc --- /dev/null +++ b/src/journal/mmap-cache.c @@ -0,0 +1,577 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2012 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include +#include +#include +#include + +#include "util.h" + +#include "mmap-cache.h" + +#define WINDOW_SIZE (8ULL*1024ULL*1024ULL) +#define WINDOWS_MAX 32 + +typedef struct Window { + int fd; + void *ptr; + uint64_t offset; + uint64_t size; + + unsigned n_ref; + unsigned lru_prev; + unsigned lru_next; + + unsigned by_fd_prev; + unsigned by_fd_next; +} Window; + +typedef struct FileDescriptor { + int fd; + unsigned windows; +} FileDescriptor; + +struct MMapCache { + unsigned n_ref; + + unsigned contexts_max; + unsigned windows_max; + unsigned fds_max; + + unsigned n_windows; + unsigned n_fds; + + unsigned lru_first, lru_last; + + Window *windows; + unsigned *by_context; + FileDescriptor *by_fd; +}; + +static void mmap_cache_window_unmap(MMapCache *m, unsigned w) { + Window *v; + + assert(m); + assert(w < m->n_windows); + + v = m->windows + w; + if (!v->ptr) + return; + + munmap(v->ptr, v->size); + v->ptr = NULL; +} + +static void mmap_cache_window_add_lru(MMapCache *m, unsigned w) { + Window *v; + + assert(m); + assert(w < m->n_windows); + + v = m->windows + w; + v->lru_prev = m->lru_last; + v->lru_next = (unsigned) -1; 
+ + m->lru_last = w; + if (m->lru_first == (unsigned) -1) + m->lru_first = w; +} + +static void mmap_cache_window_remove_lru(MMapCache *m, unsigned w) { + Window *v; + + assert(m); + assert(w < m->n_windows); + + v = m->windows + w; + + if (v->lru_prev == (unsigned) -1) + m->lru_first = v->lru_next; + else + m->windows[v->lru_prev].lru_next = v->lru_next; + + if (v->lru_next == (unsigned) -1) + m->lru_last = v->lru_prev; + else + m->windows[v->lru_next].lru_prev = v->lru_prev; +} + +static void mmap_cache_fd_add(MMapCache *m, unsigned fd_index, unsigned w) { + Window *v; + + assert(m); + assert(fd_index < m->n_fds); + + v = m->windows + w; + v->by_fd_next = m->by_fd[fd_index].windows; + v->by_fd_prev = (unsigned) -1; + + m->by_fd[fd_index].windows = w; +} + +static void mmap_cache_fd_remove(MMapCache *m, unsigned fd_index, unsigned w) { + Window *v; + + assert(m); + assert(fd_index < m->n_fds); + + v = m->windows + w; + if (v->by_fd_prev == (unsigned) -1) + m->by_fd[fd_index].windows = v->by_fd_next; + else + m->windows[v->by_fd_prev].by_fd_next = v->by_fd_next; + + if (v->by_fd_next != (unsigned) -1) + m->windows[v->by_fd_next].by_fd_prev = v->by_fd_prev; +} + +static void mmap_cache_context_unset(MMapCache *m, unsigned c) { + Window *v; + unsigned w; + + assert(m); + assert(c < m->contexts_max); + + if (m->by_context[c] == (unsigned) -1) + return; + + w = m->by_context[c]; + m->by_context[c] = (unsigned) -1; + + v = m->windows + w; + assert(v->n_ref > 0); + v->n_ref --; + + if (v->n_ref == 0) + mmap_cache_window_add_lru(m, w); +} + +static void mmap_cache_context_set(MMapCache *m, unsigned c, unsigned w) { + Window *v; + + assert(m); + assert(c < m->contexts_max); + assert(w < m->n_windows); + + if (m->by_context[c] == w) + return; + + mmap_cache_context_unset(m, c); + + m->by_context[c] = w; + + v = m->windows + w; + v->n_ref ++; + if (v->n_ref == 1) + mmap_cache_window_remove_lru(m, w); +} + +static void mmap_cache_free(MMapCache *m) { + + assert(m); + + if 
(m->windows) { + unsigned w; + + for (w = 0; w < m->n_windows; w++) + mmap_cache_window_unmap(m, w); + + free(m->windows); + } + + free(m->by_context); + free(m->by_fd); + free(m); +} + +MMapCache* mmap_cache_new(unsigned contexts_max, unsigned fds_max) { + MMapCache *m; + + assert(contexts_max > 0); + assert(fds_max > 0); + + m = new0(MMapCache, 1); + if (!m) + return NULL; + + m->contexts_max = contexts_max; + m->fds_max = fds_max; + m->windows_max = MAX(m->contexts_max, WINDOWS_MAX); + m->n_ref = 1; + m->lru_first = (unsigned) -1; + m->lru_last = (unsigned) -1; + + m->windows = new(Window, m->windows_max); + if (!m->windows) { + mmap_cache_free(m); + return NULL; + } + + m->by_context = new(unsigned, m->contexts_max); + if (!m->by_context) { + mmap_cache_free(m); + return NULL; + } + + memset(m->by_context, -1, m->contexts_max * sizeof(unsigned)); + + m->by_fd = new(FileDescriptor, m->fds_max); + if (!m->by_fd) { + mmap_cache_free(m); + return NULL; + } + + return m; +} + +MMapCache* mmap_cache_ref(MMapCache *m) { + assert(m); + assert(m->n_ref > 0); + + m->n_ref++; + return m; +} + +MMapCache* mmap_cache_unref(MMapCache *m) { + assert(m); + assert(m->n_ref > 0); + + if (m->n_ref == 1) + mmap_cache_free(m); + else + m->n_ref--; + + return NULL; +} + +static int mmap_cache_allocate_window(MMapCache *m, unsigned *w) { + assert(m); + assert(w); + + if (m->n_windows < m->windows_max) { + *w = m->n_windows ++; + return 0; + } + + if (m->lru_first == (unsigned) -1) + return -E2BIG; + + *w = m->lru_first; + mmap_cache_window_unmap(m, *w); + mmap_cache_window_remove_lru(m, *w); + + return 0; +} + +static int mmap_cache_make_room(MMapCache *m) { + unsigned w; + + assert(m); + + w = m->lru_first; + while (w != (unsigned) -1) { + Window *v; + + v = m->windows + w; + + if (v->ptr) { + mmap_cache_window_unmap(m, w); + return 1; + } + + w = v->lru_next; + } + + return 0; +} + +static int mmap_cache_put( + MMapCache *m, + int fd, + unsigned fd_index, + int prot, + unsigned 
context, + uint64_t offset, + uint64_t size, + void **ret) { + + unsigned w; + Window *v; + void *d; + uint64_t woffset, wsize; + int r; + + assert(m); + assert(fd >= 0); + assert(context < m->contexts_max); + assert(size > 0); + assert(ret); + + woffset = offset & ~((uint64_t) page_size() - 1ULL); + wsize = size + (offset - woffset); + wsize = PAGE_ALIGN(wsize); + + if (wsize < WINDOW_SIZE) { + uint64_t delta; + + delta = (WINDOW_SIZE - wsize) / 2; + + if (delta > offset) + woffset = 0; + else + woffset -= delta; + + wsize = WINDOW_SIZE; + } + + for (;;) { + d = mmap(NULL, wsize, prot, MAP_SHARED, fd, woffset); + if (d != MAP_FAILED) + break; + if (errno != ENOMEM) + return -errno; + + r = mmap_cache_make_room(m); + if (r < 0) + return r; + if (r == 0) + return -ENOMEM; + } + + r = mmap_cache_allocate_window(m, &w); + if (r < 0) { + munmap(d, wsize); + return r; + } + + v = m->windows + w; + v->fd = fd; + v->ptr = d; + v->offset = woffset; + v->size = wsize; + + v->n_ref = 0; + v->lru_prev = v->lru_next = (unsigned) -1; + + mmap_cache_fd_add(m, fd_index, w); + mmap_cache_context_set(m, context, w); + + *ret = (uint8_t*) d + (offset - woffset); + return 1; +} + +static int fd_cmp(const void *_a, const void *_b) { + const FileDescriptor *a = _a, *b = _b; + + if (a->fd < b->fd) + return -1; + if (a->fd > b->fd) + return 1; + + return 0; +} + +static int mmap_cache_get_fd_index(MMapCache *m, int fd, unsigned *fd_index) { + FileDescriptor *j; + + assert(m); + assert(fd >= 0); + assert(fd_index); + + j = bsearch(&fd, m->by_fd, m->n_fds, sizeof(m->by_fd[0]), fd_cmp); + if (!j) { + if (m->n_fds >= m->fds_max) + return -E2BIG; + + j = m->by_fd + m->n_fds ++; + j->fd = fd; + j->windows = (unsigned) -1; + + qsort(m->by_fd, m->n_fds, sizeof(m->by_fd[0]), fd_cmp); + j = bsearch(&fd, m->by_fd, m->n_fds, sizeof(m->by_fd[0]), fd_cmp); + } + + *fd_index = (unsigned) (j - m->by_fd); + return 0; +} + +static bool mmap_cache_test_window( + MMapCache *m, + unsigned w, + uint64_t 
offset, + uint64_t size) { + Window *v; + + assert(m); + assert(w < m->n_windows); + assert(size > 0); + + v = m->windows + w; + + return offset >= v->offset && + offset + size <= v->offset + v->size; +} + +static int mmap_cache_current( + MMapCache *m, + int fd, + unsigned context, + uint64_t offset, + uint64_t size, + void **ret) { + + Window *v; + unsigned w; + + assert(m); + assert(fd >= 0); + assert(context < m->contexts_max); + assert(size > 0); + assert(ret); + + if (m->by_context[context] == (unsigned) -1) + return 0; + + w = m->by_context[context]; + v = m->windows + w; + + if (v->fd != fd) + return 0; + + if (!mmap_cache_test_window(m, w, offset, size)) + return 0; + + *ret = (uint8_t*) v->ptr + (offset - v->offset); + return 1; +} + +static int mmap_cache_find( + MMapCache *m, + unsigned fd_index, + unsigned context, + uint64_t offset, + uint64_t size, + void **ret) { + + Window *v = NULL; + unsigned w; + + assert(m); + assert(fd_index < m->n_fds); + assert(context < m->contexts_max); + assert(size > 0); + assert(ret); + + w = m->by_fd[fd_index].windows; + while (w != (unsigned) -1) { + if (mmap_cache_test_window(m, w, offset, size)) + break; + + w = m->windows[w].by_fd_next; + } + + if (w == (unsigned) -1) + return 0; + + mmap_cache_context_set(m, context, w); + + v = m->windows + w; + *ret = (uint8_t*) v->ptr + (offset - v->offset); + return 1; +} + +int mmap_cache_get( + MMapCache *m, + int fd, + int prot, + unsigned context, + uint64_t offset, + uint64_t size, + void **ret) { + + unsigned fd_index; + int r; + + assert(m); + assert(fd >= 0); + assert(context < m->contexts_max); + assert(size > 0); + assert(ret); + + /* Maybe the current pointer for this context is already the + * right one? 
*/ + r = mmap_cache_current(m, fd, context, offset, size, ret); + if (r != 0) + return r; + + /* OK, let's find the chain for this FD */ + r = mmap_cache_get_fd_index(m, fd, &fd_index); + if (r < 0) + return r; + + /* And let's look through the available mmaps */ + r = mmap_cache_find(m, fd_index, context, offset, size, ret); + if (r != 0) + return r; + + /* Not found? Then, let's add it */ + return mmap_cache_put(m, fd, fd_index, prot, context, offset, size, ret); +} + +void mmap_cache_close_fd(MMapCache *m, int fd) { + FileDescriptor *j; + unsigned fd_index, c, w; + + assert(m); + assert(fd > 0); + + j = bsearch(&fd, m->by_fd, m->n_fds, sizeof(m->by_fd[0]), fd_cmp); + if (!j) + return; + fd_index = (unsigned) (j - m->by_fd); + + for (c = 0; c < m->contexts_max; c++) { + w = m->by_context[c]; + if (w == (unsigned) -1) + continue; + + if (m->windows[w].fd == fd) + mmap_cache_context_unset(m, c); + } + + w = m->by_fd[fd_index].windows; + while (w != (unsigned) -1) { + + mmap_cache_fd_remove(m, fd_index, w); + mmap_cache_window_unmap(m, w); + + w = m->by_fd[fd_index].windows; + } + + memmove(m->by_fd + fd_index, m->by_fd + fd_index + 1, (m->n_fds - (fd_index + 1)) * sizeof(FileDescriptor)); + m->n_fds --; +} + +void mmap_cache_close_context(MMapCache *m, unsigned context) { + mmap_cache_context_unset(m, context); +} diff --git a/src/journal/mmap-cache.h b/src/journal/mmap-cache.h new file mode 100644 index 00000000000..0a88fc584fd --- /dev/null +++ b/src/journal/mmap-cache.h @@ -0,0 +1,34 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +#pragma once + +/*** + This file is part of systemd. + + Copyright 2012 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. 
+ + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include + +typedef struct MMapCache MMapCache; + +MMapCache* mmap_cache_new(unsigned contexts_max, unsigned fds_max); +MMapCache* mmap_cache_ref(MMapCache *m); +MMapCache* mmap_cache_unref(MMapCache *m); + +int mmap_cache_get(MMapCache *m, int fd, int prot, unsigned context, uint64_t offset, uint64_t size, void **ret); +void mmap_cache_close_fd(MMapCache *m, int fd); +void mmap_cache_close_context(MMapCache *m, unsigned context); diff --git a/src/journal/sd-journal.c b/src/journal/sd-journal.c index 359a7cac3e4..41526b35bf8 100644 --- a/src/journal/sd-journal.c +++ b/src/journal/sd-journal.c @@ -1118,7 +1118,7 @@ static int add_file(sd_journal *j, const char *prefix, const char *filename) { return 0; } - r = journal_file_open(path, O_RDONLY, 0, false, false, NULL, NULL, &f); + r = journal_file_open(path, O_RDONLY, 0, false, false, NULL, j->mmap, NULL, &f); free(path); if (r < 0) { @@ -1439,6 +1439,17 @@ static sd_journal *journal_new(int flags, const char *path) { return NULL; } + /* One context for each type, plus the zeroth catchall + * context. 
One fd for each file plus one for each type, which + * is needed when verifying things */ + j->mmap = mmap_cache_new(_OBJECT_TYPE_MAX, JOURNAL_FILES_MAX + _OBJECT_TYPE_MAX); + if (!j->mmap) { + hashmap_free(j->files); + hashmap_free(j->directories_by_path); + free(j->path); + free(j); + } + return j; } @@ -1527,6 +1538,9 @@ _public_ void sd_journal_close(sd_journal *j) { sd_journal_flush_matches(j); + if (j->mmap) + mmap_cache_unref(j->mmap); + free(j->path); free(j); } diff --git a/src/journal/test-journal-stream.c b/src/journal/test-journal-stream.c index 0925995fca6..707dcc178b6 100644 --- a/src/journal/test-journal-stream.c +++ b/src/journal/test-journal-stream.c @@ -79,9 +79,9 @@ int main(int argc, char *argv[]) { assert_se(mkdtemp(t)); assert_se(chdir(t) >= 0); - assert_se(journal_file_open("one.journal", O_RDWR|O_CREAT, 0666, true, false, NULL, NULL, &one) == 0); - assert_se(journal_file_open("two.journal", O_RDWR|O_CREAT, 0666, true, false, NULL, NULL, &two) == 0); - assert_se(journal_file_open("three.journal", O_RDWR|O_CREAT, 0666, true, false, NULL, NULL, &three) == 0); + assert_se(journal_file_open("one.journal", O_RDWR|O_CREAT, 0666, true, false, NULL, NULL, NULL, &one) == 0); + assert_se(journal_file_open("two.journal", O_RDWR|O_CREAT, 0666, true, false, NULL, NULL, NULL, &two) == 0); + assert_se(journal_file_open("three.journal", O_RDWR|O_CREAT, 0666, true, false, NULL, NULL, NULL, &three) == 0); for (i = 0; i < N_ENTRIES; i++) { char *p, *q; diff --git a/src/journal/test-journal.c b/src/journal/test-journal.c index 8f01b4d82e7..2fd19a755b5 100644 --- a/src/journal/test-journal.c +++ b/src/journal/test-journal.c @@ -41,7 +41,7 @@ int main(int argc, char *argv[]) { assert_se(mkdtemp(t)); assert_se(chdir(t) >= 0); - assert_se(journal_file_open("test.journal", O_RDWR|O_CREAT, 0666, true, false, NULL, NULL, &f) == 0); + assert_se(journal_file_open("test.journal", O_RDWR|O_CREAT, 0666, true, true, NULL, NULL, NULL, &f) == 0); dual_timestamp_get(&ts);