2018-07-13 03:39:21 +08:00
|
|
|
#include "cache.h"
|
2018-07-13 03:39:33 +08:00
|
|
|
#include "config.h"
|
2018-07-13 03:39:22 +08:00
|
|
|
#include "csum-file.h"
|
2018-07-13 03:39:26 +08:00
|
|
|
#include "dir.h"
|
2018-07-13 03:39:22 +08:00
|
|
|
#include "lockfile.h"
|
2018-07-13 03:39:26 +08:00
|
|
|
#include "packfile.h"
|
2018-07-13 03:39:23 +08:00
|
|
|
#include "object-store.h"
|
2020-12-31 19:56:23 +08:00
|
|
|
#include "hash-lookup.h"
|
2018-07-13 03:39:21 +08:00
|
|
|
#include "midx.h"
|
2018-09-14 02:02:26 +08:00
|
|
|
#include "progress.h"
|
2019-03-22 03:36:13 +08:00
|
|
|
#include "trace2.h"
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referenced objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
#include "run-command.h"
|
2020-09-25 20:33:34 +08:00
|
|
|
#include "repository.h"
|
2021-02-18 22:07:33 +08:00
|
|
|
#include "chunk-format.h"
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format described a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
#include "pack.h"
|
2018-07-13 03:39:21 +08:00
|
|
|
|
2018-07-13 03:39:22 +08:00
|
|
|
#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
|
|
|
|
#define MIDX_VERSION 1
|
2018-07-13 03:39:23 +08:00
|
|
|
#define MIDX_BYTE_FILE_VERSION 4
|
|
|
|
#define MIDX_BYTE_HASH_VERSION 5
|
|
|
|
#define MIDX_BYTE_NUM_CHUNKS 6
|
|
|
|
#define MIDX_BYTE_NUM_PACKS 8
|
2018-07-13 03:39:22 +08:00
|
|
|
#define MIDX_HEADER_SIZE 12
|
2019-08-19 04:04:27 +08:00
|
|
|
#define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + the_hash_algo->rawsz)
|
2018-07-13 03:39:22 +08:00
|
|
|
|
2018-07-13 03:39:27 +08:00
|
|
|
#define MIDX_CHUNK_ALIGNMENT 4
|
|
|
|
#define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
|
2018-07-13 03:39:31 +08:00
|
|
|
#define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
|
2018-07-13 03:39:30 +08:00
|
|
|
#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
|
2018-07-13 03:39:32 +08:00
|
|
|
#define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
|
|
|
|
#define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
|
2018-07-13 03:39:31 +08:00
|
|
|
#define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256)
|
2018-07-13 03:39:32 +08:00
|
|
|
#define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t))
|
|
|
|
#define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t))
|
|
|
|
#define MIDX_LARGE_OFFSET_NEEDED 0x80000000
|
2018-07-13 03:39:27 +08:00
|
|
|
|
2019-06-11 07:35:25 +08:00
|
|
|
#define PACK_EXPIRED UINT_MAX
|
|
|
|
|
2020-08-17 22:04:48 +08:00
|
|
|
static uint8_t oid_version(void)
|
|
|
|
{
|
|
|
|
switch (hash_algo_by_ptr(the_hash_algo)) {
|
|
|
|
case GIT_HASH_SHA1:
|
|
|
|
return 1;
|
|
|
|
case GIT_HASH_SHA256:
|
|
|
|
return 2;
|
|
|
|
default:
|
|
|
|
die(_("invalid hash version"));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-01 04:52:21 +08:00
|
|
|
const unsigned char *get_midx_checksum(struct multi_pack_index *m)
|
pack-revindex: read multi-pack reverse indexes
Implement reading for multi-pack reverse indexes, as described in the
previous patch.
Note that these functions don't yet have any callers, and won't until
multi-pack reachability bitmaps are introduced in a later patch series.
In the meantime, this patch implements some of the infrastructure
necessary to support multi-pack bitmaps.
There are three new functions exposed by the revindex API:
- load_midx_revindex(): loads the reverse index corresponding to the
given multi-pack index.
- midx_to_pack_pos() and pack_pos_to_midx(): these convert between the
multi-pack index and pseudo-pack order.
load_midx_revindex() and pack_pos_to_midx() are both relatively
straightforward.
load_midx_revindex() needs a few functions to be exposed from the midx
API. One to get the checksum of a midx, and another to get the .rev's
filename. Similar to recent changes in the packed_git struct, three new
fields are added to the multi_pack_index struct: one to keep track of
the size, one to keep track of the mmap'd pointer, and another to point
past the header and at the reverse index's data.
pack_pos_to_midx() simply reads the corresponding entry out of the
table.
midx_to_pack_pos() is the trickiest, since it needs to find an object's
position in the psuedo-pack order, but that order can only be recovered
in the .rev file itself. This mapping can be implemented with a binary
search, but note that the thing we're binary searching over isn't an
array of values, but rather a permuted order of those values.
So, when comparing two items, it's helpful to keep in mind the
difference. Instead of a traditional binary search, where you are
comparing two things directly, here we're comparing a (pack, offset)
tuple with an index into the multi-pack index. That index describes
another (pack, offset) tuple, and it is _those_ two tuples that are
compared.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:26 +08:00
|
|
|
{
|
|
|
|
return m->data + m->data_len - the_hash_algo->rawsz;
|
|
|
|
}
|
|
|
|
|
2021-09-01 04:52:21 +08:00
|
|
|
/*
 * Build the path "<object_dir>/pack/multi-pack-index".
 *
 * The string comes from xstrfmt(); load_multi_pack_index() releases it
 * with free()/FREE_AND_NULL(), so callers are expected to free it.
 */
char *get_midx_filename(const char *object_dir)
{
	char *midx_path = xstrfmt("%s/pack/multi-pack-index", object_dir);

	return midx_path;
}
|
|
|
|
|
pack-revindex: read multi-pack reverse indexes
Implement reading for multi-pack reverse indexes, as described in the
previous patch.
Note that these functions don't yet have any callers, and won't until
multi-pack reachability bitmaps are introduced in a later patch series.
In the meantime, this patch implements some of the infrastructure
necessary to support multi-pack bitmaps.
There are three new functions exposed by the revindex API:
- load_midx_revindex(): loads the reverse index corresponding to the
given multi-pack index.
- midx_to_pack_pos() and pack_pos_to_midx(): these convert between the
multi-pack index and pseudo-pack order.
load_midx_revindex() and pack_pos_to_midx() are both relatively
straightforward.
load_midx_revindex() needs a few functions to be exposed from the midx
API. One to get the checksum of a midx, and another to get the .rev's
filename. Similar to recent changes in the packed_git struct, three new
fields are added to the multi_pack_index struct: one to keep track of
the size, one to keep track of the mmap'd pointer, and another to point
past the header and at the reverse index's data.
pack_pos_to_midx() simply reads the corresponding entry out of the
table.
midx_to_pack_pos() is the trickiest, since it needs to find an object's
position in the pseudo-pack order, but that order can only be recovered
in the .rev file itself. This mapping can be implemented with a binary
search, but note that the thing we're binary searching over isn't an
array of values, but rather a permuted order of those values.
So, when comparing two items, it's helpful to keep in mind the
difference. Instead of a traditional binary search, where you are
comparing two things directly, here we're comparing a (pack, offset)
tuple with an index into the multi-pack index. That index describes
another (pack, offset) tuple, and it is _those_ two tuples that are
compared.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:26 +08:00
|
|
|
char *get_midx_rev_filename(struct multi_pack_index *m)
|
|
|
|
{
|
|
|
|
return xstrfmt("%s/pack/multi-pack-index-%s.rev",
|
|
|
|
m->object_dir, hash_to_hex(get_midx_checksum(m)));
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:36 +08:00
|
|
|
static int midx_read_oid_fanout(const unsigned char *chunk_start,
|
|
|
|
size_t chunk_size, void *data)
|
|
|
|
{
|
|
|
|
struct multi_pack_index *m = data;
|
|
|
|
m->chunk_oid_fanout = (uint32_t *)chunk_start;
|
|
|
|
|
|
|
|
if (chunk_size != 4 * 256) {
|
|
|
|
error(_("multi-pack-index OID fanout is of the wrong size"));
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-08-21 00:51:55 +08:00
|
|
|
struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local)
|
2018-07-13 03:39:23 +08:00
|
|
|
{
|
|
|
|
struct multi_pack_index *m = NULL;
|
|
|
|
int fd;
|
|
|
|
struct stat st;
|
|
|
|
size_t midx_size;
|
|
|
|
void *midx_map = NULL;
|
|
|
|
uint32_t hash_version;
|
|
|
|
char *midx_name = get_midx_filename(object_dir);
|
2018-07-13 03:39:27 +08:00
|
|
|
uint32_t i;
|
2018-07-13 03:39:28 +08:00
|
|
|
const char *cur_pack_name;
|
2021-02-18 22:07:36 +08:00
|
|
|
struct chunkfile *cf = NULL;
|
2018-07-13 03:39:23 +08:00
|
|
|
|
|
|
|
fd = git_open(midx_name);
|
|
|
|
|
|
|
|
if (fd < 0)
|
|
|
|
goto cleanup_fail;
|
|
|
|
if (fstat(fd, &st)) {
|
|
|
|
error_errno(_("failed to read %s"), midx_name);
|
|
|
|
goto cleanup_fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
midx_size = xsize_t(st.st_size);
|
|
|
|
|
|
|
|
if (midx_size < MIDX_MIN_SIZE) {
|
|
|
|
error(_("multi-pack-index file %s is too small"), midx_name);
|
|
|
|
goto cleanup_fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
FREE_AND_NULL(midx_name);
|
|
|
|
|
|
|
|
midx_map = xmmap(NULL, midx_size, PROT_READ, MAP_PRIVATE, fd, 0);
|
2020-04-24 21:17:16 +08:00
|
|
|
close(fd);
|
2018-07-13 03:39:23 +08:00
|
|
|
|
2019-04-04 06:00:05 +08:00
|
|
|
FLEX_ALLOC_STR(m, object_dir, object_dir);
|
2018-07-13 03:39:23 +08:00
|
|
|
m->data = midx_map;
|
|
|
|
m->data_len = midx_size;
|
2018-08-21 00:51:55 +08:00
|
|
|
m->local = local;
|
2018-07-13 03:39:23 +08:00
|
|
|
|
|
|
|
m->signature = get_be32(m->data);
|
2018-09-14 02:02:15 +08:00
|
|
|
if (m->signature != MIDX_SIGNATURE)
|
|
|
|
die(_("multi-pack-index signature 0x%08x does not match signature 0x%08x"),
|
2018-07-13 03:39:23 +08:00
|
|
|
m->signature, MIDX_SIGNATURE);
|
|
|
|
|
|
|
|
m->version = m->data[MIDX_BYTE_FILE_VERSION];
|
2018-09-14 02:02:15 +08:00
|
|
|
if (m->version != MIDX_VERSION)
|
|
|
|
die(_("multi-pack-index version %d not recognized"),
|
2018-07-13 03:39:23 +08:00
|
|
|
m->version);
|
|
|
|
|
|
|
|
hash_version = m->data[MIDX_BYTE_HASH_VERSION];
|
2020-08-17 22:04:48 +08:00
|
|
|
if (hash_version != oid_version()) {
|
|
|
|
error(_("multi-pack-index hash version %u does not match version %u"),
|
|
|
|
hash_version, oid_version());
|
|
|
|
goto cleanup_fail;
|
|
|
|
}
|
2019-08-19 04:04:27 +08:00
|
|
|
m->hash_len = the_hash_algo->rawsz;
|
2018-07-13 03:39:23 +08:00
|
|
|
|
|
|
|
m->num_chunks = m->data[MIDX_BYTE_NUM_CHUNKS];
|
|
|
|
|
|
|
|
m->num_packs = get_be32(m->data + MIDX_BYTE_NUM_PACKS);
|
|
|
|
|
2021-02-18 22:07:36 +08:00
|
|
|
cf = init_chunkfile(NULL);
|
2018-07-13 03:39:27 +08:00
|
|
|
|
2021-02-18 22:07:36 +08:00
|
|
|
if (read_table_of_contents(cf, m->data, midx_size,
|
|
|
|
MIDX_HEADER_SIZE, m->num_chunks))
|
|
|
|
goto cleanup_fail;
|
2018-07-13 03:39:27 +08:00
|
|
|
|
2021-02-18 22:07:36 +08:00
|
|
|
if (pair_chunk(cf, MIDX_CHUNKID_PACKNAMES, &m->chunk_pack_names) == CHUNK_NOT_FOUND)
|
2018-07-13 03:39:27 +08:00
|
|
|
die(_("multi-pack-index missing required pack-name chunk"));
|
2021-02-18 22:07:36 +08:00
|
|
|
if (read_chunk(cf, MIDX_CHUNKID_OIDFANOUT, midx_read_oid_fanout, m) == CHUNK_NOT_FOUND)
|
2018-07-13 03:39:31 +08:00
|
|
|
die(_("multi-pack-index missing required OID fanout chunk"));
|
2021-02-18 22:07:36 +08:00
|
|
|
if (pair_chunk(cf, MIDX_CHUNKID_OIDLOOKUP, &m->chunk_oid_lookup) == CHUNK_NOT_FOUND)
|
2018-07-13 03:39:30 +08:00
|
|
|
die(_("multi-pack-index missing required OID lookup chunk"));
|
2021-02-18 22:07:36 +08:00
|
|
|
if (pair_chunk(cf, MIDX_CHUNKID_OBJECTOFFSETS, &m->chunk_object_offsets) == CHUNK_NOT_FOUND)
|
2018-07-13 03:39:32 +08:00
|
|
|
die(_("multi-pack-index missing required object offsets chunk"));
|
2018-07-13 03:39:27 +08:00
|
|
|
|
2021-02-18 22:07:36 +08:00
|
|
|
pair_chunk(cf, MIDX_CHUNKID_LARGEOFFSETS, &m->chunk_large_offsets);
|
|
|
|
|
2018-07-13 03:39:31 +08:00
|
|
|
m->num_objects = ntohl(m->chunk_oid_fanout[255]);
|
|
|
|
|
2021-03-14 00:17:22 +08:00
|
|
|
CALLOC_ARRAY(m->pack_names, m->num_packs);
|
|
|
|
CALLOC_ARRAY(m->packs, m->num_packs);
|
2018-07-13 03:39:28 +08:00
|
|
|
|
|
|
|
cur_pack_name = (const char *)m->chunk_pack_names;
|
|
|
|
for (i = 0; i < m->num_packs; i++) {
|
|
|
|
m->pack_names[i] = cur_pack_name;
|
|
|
|
|
|
|
|
cur_pack_name += strlen(cur_pack_name) + 1;
|
|
|
|
|
2018-09-14 02:02:18 +08:00
|
|
|
if (i && strcmp(m->pack_names[i], m->pack_names[i - 1]) <= 0)
|
|
|
|
die(_("multi-pack-index pack names out of order: '%s' before '%s'"),
|
2018-07-13 03:39:28 +08:00
|
|
|
m->pack_names[i - 1],
|
|
|
|
m->pack_names[i]);
|
|
|
|
}
|
|
|
|
|
2019-03-22 03:36:13 +08:00
|
|
|
trace2_data_intmax("midx", the_repository, "load/num_packs", m->num_packs);
|
|
|
|
trace2_data_intmax("midx", the_repository, "load/num_objects", m->num_objects);
|
|
|
|
|
2018-07-13 03:39:23 +08:00
|
|
|
return m;
|
|
|
|
|
|
|
|
cleanup_fail:
|
|
|
|
free(m);
|
|
|
|
free(midx_name);
|
2021-02-18 22:07:36 +08:00
|
|
|
free(cf);
|
2018-07-13 03:39:23 +08:00
|
|
|
if (midx_map)
|
|
|
|
munmap(midx_map, midx_size);
|
|
|
|
if (0 <= fd)
|
|
|
|
close(fd);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2018-10-13 01:34:19 +08:00
|
|
|
void close_midx(struct multi_pack_index *m)
|
2018-07-13 03:39:36 +08:00
|
|
|
{
|
|
|
|
uint32_t i;
|
2018-10-13 01:34:19 +08:00
|
|
|
|
|
|
|
if (!m)
|
|
|
|
return;
|
|
|
|
|
2021-09-01 04:52:07 +08:00
|
|
|
close_midx(m->next);
|
|
|
|
|
2018-07-13 03:39:36 +08:00
|
|
|
munmap((unsigned char *)m->data, m->data_len);
|
|
|
|
|
|
|
|
for (i = 0; i < m->num_packs; i++) {
|
midx: add packs to packed_git linked list
The multi-pack-index allows searching for objects across multiple
packs using one object list. The original design gains many of
these performance benefits by keeping the packs in the
multi-pack-index out of the packed_git list.
Unfortunately, this has one major drawback. If the multi-pack-index
covers thousands of packs, and a command loads many of those packs,
then we can hit the limit for open file descriptors. The
close_one_pack() method is used to limit this resource, but it
only looks at the packed_git list, and uses an LRU cache to prevent
thrashing.
Instead of complicating this close_one_pack() logic to include
direct references to the multi-pack-index, simply add the packs
opened by the multi-pack-index to the packed_git list. This
immediately solves the file-descriptor limit problem, but requires
some extra steps to avoid performance issues or other problems:
1. Create a multi_pack_index bit in the packed_git struct that is
one if and only if the pack was loaded from a multi-pack-index.
2. Skip packs with the multi_pack_index bit when doing object
lookups and abbreviations. These algorithms already check the
multi-pack-index before the packed_git struct. This has a very
small performance hit, as we need to walk more packed_git
structs. This is acceptable, since these operations run binary
search on the other packs, so this walk-and-ignore logic is
very fast by comparison.
3. When closing a multi-pack-index file, do not close its packs,
as those packs will be closed using close_all_packs(). In some
cases, such as 'git repack', we run 'close_midx()' without also
closing the packs, so we need to un-set the multi_pack_index bit
in those packs. This is necessary, and caught by running
t6501-freshen-objects.sh with GIT_TEST_MULTI_PACK_INDEX=1.
To manually test this change, I inserted trace2 logging into
close_pack_fd() and set pack_max_fds to 10, then ran 'git rev-list
--all --objects' on a copy of the Git repo with 300+ pack-files and
a multi-pack-index. The logs verified the packs are closed as
we read them beyond the file descriptor limit.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-30 00:18:56 +08:00
|
|
|
if (m->packs[i])
|
|
|
|
m->packs[i]->multi_pack_index = 0;
|
2018-07-13 03:39:36 +08:00
|
|
|
}
|
|
|
|
FREE_AND_NULL(m->packs);
|
|
|
|
FREE_AND_NULL(m->pack_names);
|
2021-09-01 04:52:07 +08:00
|
|
|
free(m);
|
2018-07-13 03:39:36 +08:00
|
|
|
}
|
|
|
|
|
2019-04-30 00:18:55 +08:00
|
|
|
int prepare_midx_pack(struct repository *r, struct multi_pack_index *m, uint32_t pack_int_id)
|
2018-07-13 03:39:34 +08:00
|
|
|
{
|
|
|
|
struct strbuf pack_name = STRBUF_INIT;
|
midx: add packs to packed_git linked list
The multi-pack-index allows searching for objects across multiple
packs using one object list. The original design gains many of
these performance benefits by keeping the packs in the
multi-pack-index out of the packed_git list.
Unfortunately, this has one major drawback. If the multi-pack-index
covers thousands of packs, and a command loads many of those packs,
then we can hit the limit for open file descriptors. The
close_one_pack() method is used to limit this resource, but it
only looks at the packed_git list, and uses an LRU cache to prevent
thrashing.
Instead of complicating this close_one_pack() logic to include
direct references to the multi-pack-index, simply add the packs
opened by the multi-pack-index to the packed_git list. This
immediately solves the file-descriptor limit problem, but requires
some extra steps to avoid performance issues or other problems:
1. Create a multi_pack_index bit in the packed_git struct that is
one if and only if the pack was loaded from a multi-pack-index.
2. Skip packs with the multi_pack_index bit when doing object
lookups and abbreviations. These algorithms already check the
multi-pack-index before the packed_git struct. This has a very
small performance hit, as we need to walk more packed_git
structs. This is acceptable, since these operations run binary
search on the other packs, so this walk-and-ignore logic is
very fast by comparison.
3. When closing a multi-pack-index file, do not close its packs,
as those packs will be closed using close_all_packs(). In some
cases, such as 'git repack', we run 'close_midx()' without also
closing the packs, so we need to un-set the multi_pack_index bit
in those packs. This is necessary, and caught by running
t6501-freshen-objects.sh with GIT_TEST_MULTI_PACK_INDEX=1.
To manually test this change, I inserted trace2 logging into
close_pack_fd() and set pack_max_fds to 10, then ran 'git rev-list
--all --objects' on a copy of the Git repo with 300+ pack-files and
a multi-pack-index. The logs verified the packs are closed as
we read them beyond the file descriptor limit.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-30 00:18:56 +08:00
|
|
|
struct packed_git *p;
|
2018-07-13 03:39:34 +08:00
|
|
|
|
|
|
|
if (pack_int_id >= m->num_packs)
|
2018-11-29 05:43:09 +08:00
|
|
|
die(_("bad pack-int-id: %u (%u total packs)"),
|
2018-09-14 02:02:25 +08:00
|
|
|
pack_int_id, m->num_packs);
|
2018-07-13 03:39:34 +08:00
|
|
|
|
|
|
|
if (m->packs[pack_int_id])
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
strbuf_addf(&pack_name, "%s/pack/%s", m->object_dir,
|
|
|
|
m->pack_names[pack_int_id]);
|
|
|
|
|
midx: add packs to packed_git linked list
The multi-pack-index allows searching for objects across multiple
packs using one object list. The original design gains many of
these performance benefits by keeping the packs in the
multi-pack-index out of the packed_git list.
Unfortunately, this has one major drawback. If the multi-pack-index
covers thousands of packs, and a command loads many of those packs,
then we can hit the limit for open file descriptors. The
close_one_pack() method is used to limit this resource, but it
only looks at the packed_git list, and uses an LRU cache to prevent
thrashing.
Instead of complicating this close_one_pack() logic to include
direct references to the multi-pack-index, simply add the packs
opened by the multi-pack-index to the packed_git list. This
immediately solves the file-descriptor limit problem, but requires
some extra steps to avoid performance issues or other problems:
1. Create a multi_pack_index bit in the packed_git struct that is
one if and only if the pack was loaded from a multi-pack-index.
2. Skip packs with the multi_pack_index bit when doing object
lookups and abbreviations. These algorithms already check the
multi-pack-index before the packed_git struct. This has a very
small performance hit, as we need to walk more packed_git
structs. This is acceptable, since these operations run binary
search on the other packs, so this walk-and-ignore logic is
very fast by comparison.
3. When closing a multi-pack-index file, do not close its packs,
as those packs will be closed using close_all_packs(). In some
cases, such as 'git repack', we run 'close_midx()' without also
closing the packs, so we need to un-set the multi_pack_index bit
in those packs. This is necessary, and caught by running
t6501-freshen-objects.sh with GIT_TEST_MULTI_PACK_INDEX=1.
To manually test this change, I inserted trace2 logging into
close_pack_fd() and set pack_max_fds to 10, then ran 'git rev-list
--all --objects' on a copy of the Git repo with 300+ pack-files and
a multi-pack-index. The logs verified the packs are closed as
we read them beyond the file descriptor limit.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-30 00:18:56 +08:00
|
|
|
p = add_packed_git(pack_name.buf, pack_name.len, m->local);
|
2018-07-13 03:39:34 +08:00
|
|
|
strbuf_release(&pack_name);
|
midx: add packs to packed_git linked list
The multi-pack-index allows searching for objects across multiple
packs using one object list. The original design gains many of
these performance benefits by keeping the packs in the
multi-pack-index out of the packed_git list.
Unfortunately, this has one major drawback. If the multi-pack-index
covers thousands of packs, and a command loads many of those packs,
then we can hit the limit for open file descriptors. The
close_one_pack() method is used to limit this resource, but it
only looks at the packed_git list, and uses an LRU cache to prevent
thrashing.
Instead of complicating this close_one_pack() logic to include
direct references to the multi-pack-index, simply add the packs
opened by the multi-pack-index to the packed_git list. This
immediately solves the file-descriptor limit problem, but requires
some extra steps to avoid performance issues or other problems:
1. Create a multi_pack_index bit in the packed_git struct that is
one if and only if the pack was loaded from a multi-pack-index.
2. Skip packs with the multi_pack_index bit when doing object
lookups and abbreviations. These algorithms already check the
multi-pack-index before the packed_git struct. This has a very
small performance hit, as we need to walk more packed_git
structs. This is acceptable, since these operations run binary
search on the other packs, so this walk-and-ignore logic is
very fast by comparison.
3. When closing a multi-pack-index file, do not close its packs,
as those packs will be closed using close_all_packs(). In some
cases, such as 'git repack', we run 'close_midx()' without also
closing the packs, so we need to un-set the multi_pack_index bit
in those packs. This is necessary, and caught by running
t6501-freshen-objects.sh with GIT_TEST_MULTI_PACK_INDEX=1.
To manually test this change, I inserted trace2 logging into
close_pack_fd() and set pack_max_fds to 10, then ran 'git rev-list
--all --objects' on a copy of the Git repo with 300+ pack-files and
a multi-pack-index. The logs verified the packs are closed as
we read them beyond the file descriptor limit.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-30 00:18:56 +08:00
|
|
|
|
|
|
|
if (!p)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
p->multi_pack_index = 1;
|
|
|
|
m->packs[pack_int_id] = p;
|
|
|
|
install_packed_git(r, p);
|
|
|
|
list_add_tail(&p->mru, &r->objects->packed_git_mru);
|
|
|
|
|
|
|
|
return 0;
|
2018-07-13 03:39:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int bsearch_midx(const struct object_id *oid, struct multi_pack_index *m, uint32_t *result)
|
|
|
|
{
|
|
|
|
return bsearch_hash(oid->hash, m->chunk_oid_fanout, m->chunk_oid_lookup,
|
2019-08-19 04:04:27 +08:00
|
|
|
the_hash_algo->rawsz, result);
|
2018-07-13 03:39:34 +08:00
|
|
|
}
|
|
|
|
|
2018-07-13 03:39:35 +08:00
|
|
|
struct object_id *nth_midxed_object_oid(struct object_id *oid,
|
|
|
|
struct multi_pack_index *m,
|
|
|
|
uint32_t n)
|
|
|
|
{
|
|
|
|
if (n >= m->num_objects)
|
|
|
|
return NULL;
|
|
|
|
|
2021-04-26 09:02:50 +08:00
|
|
|
oidread(oid, m->chunk_oid_lookup + m->hash_len * n);
|
2018-07-13 03:39:35 +08:00
|
|
|
return oid;
|
|
|
|
}
|
|
|
|
|
2021-03-30 23:04:20 +08:00
|
|
|
off_t nth_midxed_offset(struct multi_pack_index *m, uint32_t pos)
|
2018-07-13 03:39:34 +08:00
|
|
|
{
|
|
|
|
const unsigned char *offset_data;
|
|
|
|
uint32_t offset32;
|
|
|
|
|
2021-02-18 22:07:37 +08:00
|
|
|
offset_data = m->chunk_object_offsets + (off_t)pos * MIDX_CHUNK_OFFSET_WIDTH;
|
2018-07-13 03:39:34 +08:00
|
|
|
offset32 = get_be32(offset_data + sizeof(uint32_t));
|
|
|
|
|
|
|
|
if (m->chunk_large_offsets && offset32 & MIDX_LARGE_OFFSET_NEEDED) {
|
2018-09-14 02:02:23 +08:00
|
|
|
if (sizeof(off_t) < sizeof(uint64_t))
|
2018-07-13 03:39:34 +08:00
|
|
|
die(_("multi-pack-index stores a 64-bit offset, but off_t is too small"));
|
|
|
|
|
|
|
|
offset32 ^= MIDX_LARGE_OFFSET_NEEDED;
|
|
|
|
return get_be64(m->chunk_large_offsets + sizeof(uint64_t) * offset32);
|
|
|
|
}
|
|
|
|
|
|
|
|
return offset32;
|
|
|
|
}
|
|
|
|
|
2021-03-30 23:04:20 +08:00
|
|
|
uint32_t nth_midxed_pack_int_id(struct multi_pack_index *m, uint32_t pos)
|
2018-07-13 03:39:34 +08:00
|
|
|
{
|
2021-02-18 22:07:37 +08:00
|
|
|
return get_be32(m->chunk_object_offsets +
|
|
|
|
(off_t)pos * MIDX_CHUNK_OFFSET_WIDTH);
|
2018-07-13 03:39:34 +08:00
|
|
|
}
|
|
|
|
|
2019-04-30 00:18:55 +08:00
|
|
|
static int nth_midxed_pack_entry(struct repository *r,
|
|
|
|
struct multi_pack_index *m,
|
|
|
|
struct pack_entry *e,
|
|
|
|
uint32_t pos)
|
2018-07-13 03:39:34 +08:00
|
|
|
{
|
|
|
|
uint32_t pack_int_id;
|
|
|
|
struct packed_git *p;
|
|
|
|
|
|
|
|
if (pos >= m->num_objects)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
pack_int_id = nth_midxed_pack_int_id(m, pos);
|
|
|
|
|
2019-04-30 00:18:55 +08:00
|
|
|
if (prepare_midx_pack(r, m, pack_int_id))
|
midx.c: protect against disappearing packs
When a packed object is stored in a multi-pack index, but that pack has
racily gone away, the MIDX code simply calls die(), when it could be
returning an error to the caller, which would in turn lead to
re-scanning the pack directory.
A pack can racily disappear, for example, due to a simultaneous 'git
repack -ad',
You can also reproduce this with two terminals, where one is running:
git init
while true; do
git commit -q --allow-empty -m foo
git repack -ad
git multi-pack-index write
done
(in effect, constantly writing new MIDXs), and the other is running:
obj=$(git rev-parse HEAD)
while true; do
echo $obj | git cat-file --batch-check='%(objectsize:disk)' || break
done
That will sometimes hit the error preparing packfile from
multi-pack-index message, which this patch fixes.
Right now, that path to discovering a missing pack looks something like
'find_pack_entry()' calling 'fill_midx_entry()' and eventually making
its way to call 'nth_midxed_pack_entry()'.
'nth_midxed_pack_entry()' already checks 'is_pack_valid()' and
propagates an error if the pack is invalid. So, this works if the pack
has gone away between calling 'prepare_midx_pack()' and before calling
'is_pack_valid()', but not if it disappears before then.
Catch the case where the pack has already disappeared before
'prepare_midx_pack()' by returning an error in that case, too.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-11-26 01:17:33 +08:00
|
|
|
return 0;
|
2018-07-13 03:39:34 +08:00
|
|
|
p = m->packs[pack_int_id];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are about to tell the caller where they can locate the
|
|
|
|
* requested object. We better make sure the packfile is
|
|
|
|
* still here and can be accessed before supplying that
|
|
|
|
* answer, as it may have been deleted since the MIDX was
|
|
|
|
* loaded!
|
|
|
|
*/
|
|
|
|
if (!is_pack_valid(p))
|
|
|
|
return 0;
|
|
|
|
|
2018-08-21 00:51:57 +08:00
|
|
|
if (p->num_bad_objects) {
|
|
|
|
uint32_t i;
|
|
|
|
struct object_id oid;
|
|
|
|
nth_midxed_object_oid(&oid, m, pos);
|
|
|
|
for (i = 0; i < p->num_bad_objects; i++)
|
2018-10-03 05:19:21 +08:00
|
|
|
if (hasheq(oid.hash,
|
|
|
|
p->bad_object_sha1 + the_hash_algo->rawsz * i))
|
2018-08-21 00:51:57 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-13 03:39:34 +08:00
|
|
|
e->offset = nth_midxed_offset(m, pos);
|
|
|
|
e->p = p;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2019-04-30 00:18:55 +08:00
|
|
|
/*
 * Locate 'oid' in the multi-pack-index and, if present, fill 'e' with
 * its pack and offset.
 *
 * Returns 0 when the object is not in the MIDX; otherwise returns the
 * result of nth_midxed_pack_entry() (1 on success, 0 if the entry
 * cannot be used).
 */
int fill_midx_entry(struct repository * r,
		    const struct object_id *oid,
		    struct pack_entry *e,
		    struct multi_pack_index *m)
{
	uint32_t pos;

	if (bsearch_midx(oid, m, &pos))
		return nth_midxed_pack_entry(r, m, e, pos);

	return 0;
}
|
|
|
|
|
midx: check both pack and index names for containment
A midx file (and the struct we parse from it) contains a list of all of
the covered packfiles, mentioned by their ".idx" names (e.g.,
"pack-1234.idx", etc). And thus calls to midx_contains_pack() expect
callers to provide the idx name.
This works for most of the calls, but the one in open_packed_git_1()
tries to feed a packed_git->pack_name, which is the ".pack" name,
meaning we'll never find a match (even if the pack is covered by the
midx).
We can fix this by converting the ".pack" to ".idx" in the caller.
However, that requires allocating a new string. Instead, let's make
midx_contains_pack() a bit friendlier, and allow it take _either_ the
.pack or .idx variant.
All cleverness in the matching code is credited to René. Bugs are mine.
There's no test here, because while this does fix _a_ bug, it's masked
by another bug in that same caller. That will be covered (with a test)
in the next patch.
Helped-by: René Scharfe <l.s.r@web.de>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-06 02:06:04 +08:00
|
|
|
/*
 * Compare a name that may end in either ".idx" or ".pack" against an
 * ".idx" name, treating "foo.pack" and "foo.idx" as equal to "foo.idx".
 *
 * Returns 0 on a match; otherwise a value with the sign strcmp() would
 * give for (idx_or_pack_name, idx_name).
 */
static int cmp_idx_or_pack_name(const char *idx_or_pack_name,
				const char *idx_name)
{
	size_t common = 0;

	/* Measure the prefix that both names share. */
	while (idx_name[common] &&
	       idx_name[common] == idx_or_pack_name[common])
		common++;

	idx_name += common;
	idx_or_pack_name += common;

	/*
	 * After a shared "pack-1234." prefix the remainders may be "idx"
	 * and "pack", which counts as a match. The "idx"/"idx" case needs
	 * no test here: it is a complete match, so both remainders are
	 * empty and the final strcmp() returns 0.
	 *
	 * Strictly speaking this also equates "fooidx" with "foopack",
	 * but names like that never occur.
	 */
	if (strcmp(idx_name, "idx") == 0 &&
	    strcmp(idx_or_pack_name, "pack") == 0)
		return 0;

	/*
	 * Comparing the remainders orders by the first differing
	 * character, exactly as a strcmp() of the full names would. That
	 * keeps this function usable for binary searching a list sorted
	 * with plain strcmp().
	 */
	return strcmp(idx_or_pack_name, idx_name);
}
|
|
|
|
|
|
|
|
int midx_contains_pack(struct multi_pack_index *m, const char *idx_or_pack_name)
|
2018-07-13 03:39:36 +08:00
|
|
|
{
|
|
|
|
uint32_t first = 0, last = m->num_packs;
|
|
|
|
|
|
|
|
while (first < last) {
|
|
|
|
uint32_t mid = first + (last - first) / 2;
|
|
|
|
const char *current;
|
|
|
|
int cmp;
|
|
|
|
|
|
|
|
current = m->pack_names[mid];
|
midx: check both pack and index names for containment
A midx file (and the struct we parse from it) contains a list of all of
the covered packfiles, mentioned by their ".idx" names (e.g.,
"pack-1234.idx", etc). And thus calls to midx_contains_pack() expect
callers to provide the idx name.
This works for most of the calls, but the one in open_packed_git_1()
tries to feed a packed_git->pack_name, which is the ".pack" name,
meaning we'll never find a match (even if the pack is covered by the
midx).
We can fix this by converting the ".pack" to ".idx" in the caller.
However, that requires allocating a new string. Instead, let's make
midx_contains_pack() a bit friendlier, and allow it take _either_ the
.pack or .idx variant.
All cleverness in the matching code is credited to René. Bugs are mine.
There's no test here, because while this does fix _a_ bug, it's masked
by another bug in that same caller. That will be covered (with a test)
in the next patch.
Helped-by: René Scharfe <l.s.r@web.de>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-04-06 02:06:04 +08:00
|
|
|
cmp = cmp_idx_or_pack_name(idx_or_pack_name, current);
|
2018-07-13 03:39:36 +08:00
|
|
|
if (!cmp)
|
|
|
|
return 1;
|
|
|
|
if (cmp > 0) {
|
|
|
|
first = mid + 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
last = mid;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-08-21 00:51:55 +08:00
|
|
|
int prepare_multi_pack_index_one(struct repository *r, const char *object_dir, int local)
|
2018-07-13 03:39:33 +08:00
|
|
|
{
|
2018-08-21 00:52:00 +08:00
|
|
|
struct multi_pack_index *m;
|
2018-07-13 03:39:33 +08:00
|
|
|
struct multi_pack_index *m_search;
|
|
|
|
|
2020-09-25 20:33:34 +08:00
|
|
|
prepare_repo_settings(r);
|
|
|
|
if (!r->settings.core_multi_pack_index)
|
2018-07-13 03:39:33 +08:00
|
|
|
return 0;
|
|
|
|
|
2018-08-21 00:52:00 +08:00
|
|
|
for (m_search = r->objects->multi_pack_index; m_search; m_search = m_search->next)
|
2018-07-13 03:39:33 +08:00
|
|
|
if (!strcmp(object_dir, m_search->object_dir))
|
|
|
|
return 1;
|
|
|
|
|
2018-08-21 00:52:00 +08:00
|
|
|
m = load_multi_pack_index(object_dir, local);
|
2018-07-13 03:39:33 +08:00
|
|
|
|
2018-08-21 00:52:00 +08:00
|
|
|
if (m) {
|
midx: traverse the local MIDX first
When a repository has an alternate object directory configured, callers
can traverse through each alternate's MIDX by walking the '->next'
pointer.
But, when 'prepare_multi_pack_index_one()' loads multiple MIDXs, it
places the new ones at the front of this pointer chain, not at the end.
This can be confusing for callers such as 'git repack -ad', causing test
failures like in t7700.6 with 'GIT_TEST_MULTI_PACK_INDEX=1'.
The occurs when dropping a pack known to the local MIDX with alternates
configured that have their own MIDX. Since the alternate's MIDX is
returned via 'get_multi_pack_index()', 'midx_contains_pack()' returns
true (which is correct, since it traverses through the '->next' pointer
to find the MIDX in the chain that does contain the requested object).
But, we call 'clear_midx_file()' on 'the_repository', which drops the
MIDX at the path of the first MIDX in the chain, which (in the case of
t7700.6 is the one in the alternate).
This patch addresses that by:
- placing the local MIDX first in the chain when calling
'prepare_multi_pack_index_one()', and
- introducing a new 'get_local_multi_pack_index()', which explicitly
returns the repository-local MIDX, if any.
Don't impose an additional order on the MIDX's '->next' pointer beyond
that the first item in the chain must be local if one exists so that we
avoid a quadratic insertion.
Likewise, use 'get_local_multi_pack_index()' in
'remove_redundant_pack()' to fix the formerly broken t7700.6 when run
with 'GIT_TEST_MULTI_PACK_INDEX=1'.
Finally, note that the MIDX ordering invariant is only preserved by the
insertion order in 'prepare_packed_git()', which traverses through the
ODB's '->next' pointer, meaning we visit the local object store first.
This fragility makes this an undesirable long-term solution if more
callers are added, but it is acceptable for now since this is the only
caller.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-29 04:22:13 +08:00
|
|
|
struct multi_pack_index *mp = r->objects->multi_pack_index;
|
|
|
|
if (mp) {
|
|
|
|
m->next = mp->next;
|
|
|
|
mp->next = m;
|
|
|
|
} else
|
|
|
|
r->objects->multi_pack_index = m;
|
2018-07-13 03:39:33 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-13 03:39:22 +08:00
|
|
|
static size_t write_midx_header(struct hashfile *f,
|
|
|
|
unsigned char num_chunks,
|
|
|
|
uint32_t num_packs)
|
|
|
|
{
|
|
|
|
hashwrite_be32(f, MIDX_SIGNATURE);
|
2020-09-06 16:59:02 +08:00
|
|
|
hashwrite_u8(f, MIDX_VERSION);
|
|
|
|
hashwrite_u8(f, oid_version());
|
|
|
|
hashwrite_u8(f, num_chunks);
|
|
|
|
hashwrite_u8(f, 0); /* unused */
|
2018-07-13 03:39:22 +08:00
|
|
|
hashwrite_be32(f, num_packs);
|
|
|
|
|
|
|
|
return MIDX_HEADER_SIZE;
|
|
|
|
}
|
|
|
|
|
2019-06-11 07:35:24 +08:00
|
|
|
/* Per-pack bookkeeping collected while a multi-pack-index is written. */
struct pack_info {
	/* position in the info array at the time add_pack_to_midx() stored it */
	uint32_t orig_pack_int_id;
	/* heap-allocated copy of the pack's ".idx" file name */
	char *pack_name;
	/* pack handle, as returned by add_packed_git() */
	struct packed_git *p;
	/* set for packs whose name must not be written to the MIDX */
	unsigned int expired : 1;
};
|
|
|
|
|
|
|
|
static int pack_info_compare(const void *_a, const void *_b)
|
|
|
|
{
|
|
|
|
struct pack_info *a = (struct pack_info *)_a;
|
|
|
|
struct pack_info *b = (struct pack_info *)_b;
|
|
|
|
return strcmp(a->pack_name, b->pack_name);
|
|
|
|
}
|
|
|
|
|
2021-03-30 23:04:11 +08:00
|
|
|
static int idx_or_pack_name_cmp(const void *_va, const void *_vb)
|
|
|
|
{
|
|
|
|
const char *pack_name = _va;
|
|
|
|
const struct pack_info *compar = _vb;
|
|
|
|
|
|
|
|
return cmp_idx_or_pack_name(pack_name, compar->pack_name);
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
/* State shared by the helpers that collect packs and write MIDX chunks. */
struct write_midx_context {
	/* growable array of packs: 'nr' slots in use, 'alloc' allocated */
	struct pack_info *info;
	uint32_t nr;
	uint32_t alloc;
	/* existing MIDX, if any; packs it already lists are not added again */
	struct multi_pack_index *m;
	/* progress meter and the running count fed to it while adding packs */
	struct progress *progress;
	unsigned pack_paths_checked;

	/* objects to write ('entries_nr' of them), in strictly ascending OID order */
	struct pack_midx_entry *entries;
	uint32_t entries_nr;

	/* maps an entry's pack_int_id to the id that is written, or PACK_EXPIRED */
	uint32_t *pack_perm;
	/* NOTE(review): presumably the array produced by midx_pack_order() - confirm */
	uint32_t *pack_order;
	/* when set, offsets with bit 31 or above set go through the large-offset table */
	unsigned large_offsets_needed:1;
	/* number of offsets write_midx_large_offsets() has to emit */
	uint32_t num_large_offsets;

	/* NOTE(review): looks like the index of the preferred pack - confirm */
	int preferred_pack_idx;
};
|
|
|
|
|
|
|
|
static void add_pack_to_midx(const char *full_path, size_t full_path_len,
|
|
|
|
const char *file_name, void *data)
|
|
|
|
{
|
2021-02-18 22:07:26 +08:00
|
|
|
struct write_midx_context *ctx = data;
|
2018-07-13 03:39:26 +08:00
|
|
|
|
|
|
|
if (ends_with(file_name, ".idx")) {
|
2021-02-18 22:07:26 +08:00
|
|
|
display_progress(ctx->progress, ++ctx->pack_paths_checked);
|
|
|
|
if (ctx->m && midx_contains_pack(ctx->m, file_name))
|
2018-07-13 03:39:36 +08:00
|
|
|
return;
|
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
ALLOC_GROW(ctx->info, ctx->nr + 1, ctx->alloc);
|
2018-07-13 03:39:26 +08:00
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
ctx->info[ctx->nr].p = add_packed_git(full_path,
|
|
|
|
full_path_len,
|
|
|
|
0);
|
2018-07-13 03:39:29 +08:00
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
if (!ctx->info[ctx->nr].p) {
|
2018-07-13 03:39:26 +08:00
|
|
|
warning(_("failed to add packfile '%s'"),
|
|
|
|
full_path);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
if (open_pack_index(ctx->info[ctx->nr].p)) {
|
2018-07-13 03:39:29 +08:00
|
|
|
warning(_("failed to open pack-index '%s'"),
|
|
|
|
full_path);
|
2021-02-18 22:07:26 +08:00
|
|
|
close_pack(ctx->info[ctx->nr].p);
|
|
|
|
FREE_AND_NULL(ctx->info[ctx->nr].p);
|
2018-07-13 03:39:29 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
ctx->info[ctx->nr].pack_name = xstrdup(file_name);
|
|
|
|
ctx->info[ctx->nr].orig_pack_int_id = ctx->nr;
|
|
|
|
ctx->info[ctx->nr].expired = 0;
|
|
|
|
ctx->nr++;
|
2018-07-13 03:39:26 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-13 03:39:29 +08:00
|
|
|
struct pack_midx_entry {
|
|
|
|
struct object_id oid;
|
|
|
|
uint32_t pack_int_id;
|
|
|
|
time_t pack_mtime;
|
|
|
|
uint64_t offset;
|
2021-03-30 23:04:11 +08:00
|
|
|
unsigned preferred : 1;
|
2018-07-13 03:39:29 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static int midx_oid_compare(const void *_a, const void *_b)
|
|
|
|
{
|
|
|
|
const struct pack_midx_entry *a = (const struct pack_midx_entry *)_a;
|
|
|
|
const struct pack_midx_entry *b = (const struct pack_midx_entry *)_b;
|
|
|
|
int cmp = oidcmp(&a->oid, &b->oid);
|
|
|
|
|
|
|
|
if (cmp)
|
|
|
|
return cmp;
|
|
|
|
|
2021-03-30 23:04:11 +08:00
|
|
|
/* Sort objects in a preferred pack first when multiple copies exist. */
|
|
|
|
if (a->preferred > b->preferred)
|
|
|
|
return -1;
|
|
|
|
if (a->preferred < b->preferred)
|
|
|
|
return 1;
|
|
|
|
|
2018-07-13 03:39:29 +08:00
|
|
|
if (a->pack_mtime > b->pack_mtime)
|
|
|
|
return -1;
|
|
|
|
else if (a->pack_mtime < b->pack_mtime)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return a->pack_int_id - b->pack_int_id;
|
|
|
|
}
|
|
|
|
|
2018-07-13 03:39:36 +08:00
|
|
|
static int nth_midxed_pack_midx_entry(struct multi_pack_index *m,
|
|
|
|
struct pack_midx_entry *e,
|
|
|
|
uint32_t pos)
|
|
|
|
{
|
|
|
|
if (pos >= m->num_objects)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
nth_midxed_object_oid(&e->oid, m, pos);
|
2019-06-11 07:35:24 +08:00
|
|
|
e->pack_int_id = nth_midxed_pack_int_id(m, pos);
|
2018-07-13 03:39:36 +08:00
|
|
|
e->offset = nth_midxed_offset(m, pos);
|
|
|
|
|
|
|
|
/* consider objects in midx to be from "old" packs */
|
|
|
|
e->pack_mtime = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-13 03:39:29 +08:00
|
|
|
static void fill_pack_entry(uint32_t pack_int_id,
|
|
|
|
struct packed_git *p,
|
|
|
|
uint32_t cur_object,
|
2021-03-30 23:04:11 +08:00
|
|
|
struct pack_midx_entry *entry,
|
|
|
|
int preferred)
|
2018-07-13 03:39:29 +08:00
|
|
|
{
|
2020-02-24 12:27:36 +08:00
|
|
|
if (nth_packed_object_id(&entry->oid, p, cur_object) < 0)
|
2018-07-13 03:39:29 +08:00
|
|
|
die(_("failed to locate object %d in packfile"), cur_object);
|
|
|
|
|
|
|
|
entry->pack_int_id = pack_int_id;
|
|
|
|
entry->pack_mtime = p->mtime;
|
|
|
|
|
|
|
|
entry->offset = nth_packed_object_offset(p, cur_object);
|
2021-03-30 23:04:11 +08:00
|
|
|
entry->preferred = !!preferred;
|
2018-07-13 03:39:29 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* It is possible to artificially get into a state where there are many
|
|
|
|
* duplicate copies of objects. That can create high memory pressure if
|
|
|
|
* we are to create a list of all objects before de-duplication. To reduce
|
|
|
|
* this memory pressure without a significant performance drop, automatically
|
|
|
|
* group objects by the first byte of their object id. Use the IDX fanout
|
|
|
|
* tables to group the data, copy to a local array, then sort.
|
|
|
|
*
|
|
|
|
* Copy only the de-duplicated entries (selected by most-recent modified time
|
|
|
|
* of a packfile containing the object).
|
|
|
|
*/
|
2018-07-13 03:39:36 +08:00
|
|
|
static struct pack_midx_entry *get_sorted_entries(struct multi_pack_index *m,
|
2019-06-11 07:35:24 +08:00
|
|
|
struct pack_info *info,
|
2018-07-13 03:39:29 +08:00
|
|
|
uint32_t nr_packs,
|
2021-03-30 23:04:11 +08:00
|
|
|
uint32_t *nr_objects,
|
|
|
|
int preferred_pack)
|
2018-07-13 03:39:29 +08:00
|
|
|
{
|
|
|
|
uint32_t cur_fanout, cur_pack, cur_object;
|
|
|
|
uint32_t alloc_fanout, alloc_objects, total_objects = 0;
|
|
|
|
struct pack_midx_entry *entries_by_fanout = NULL;
|
|
|
|
struct pack_midx_entry *deduplicated_entries = NULL;
|
2018-07-13 03:39:36 +08:00
|
|
|
uint32_t start_pack = m ? m->num_packs : 0;
|
2018-07-13 03:39:29 +08:00
|
|
|
|
2018-07-13 03:39:36 +08:00
|
|
|
for (cur_pack = start_pack; cur_pack < nr_packs; cur_pack++)
|
2019-06-11 07:35:24 +08:00
|
|
|
total_objects += info[cur_pack].p->num_objects;
|
2018-07-13 03:39:29 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* As we de-duplicate by fanout value, we expect the fanout
|
|
|
|
* slices to be evenly distributed, with some noise. Hence,
|
|
|
|
* allocate slightly more than one 256th.
|
|
|
|
*/
|
|
|
|
alloc_objects = alloc_fanout = total_objects > 3200 ? total_objects / 200 : 16;
|
|
|
|
|
|
|
|
ALLOC_ARRAY(entries_by_fanout, alloc_fanout);
|
|
|
|
ALLOC_ARRAY(deduplicated_entries, alloc_objects);
|
|
|
|
*nr_objects = 0;
|
|
|
|
|
|
|
|
for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) {
|
|
|
|
uint32_t nr_fanout = 0;
|
|
|
|
|
2018-07-13 03:39:36 +08:00
|
|
|
if (m) {
|
|
|
|
uint32_t start = 0, end;
|
|
|
|
|
|
|
|
if (cur_fanout)
|
|
|
|
start = ntohl(m->chunk_oid_fanout[cur_fanout - 1]);
|
|
|
|
end = ntohl(m->chunk_oid_fanout[cur_fanout]);
|
|
|
|
|
|
|
|
for (cur_object = start; cur_object < end; cur_object++) {
|
|
|
|
ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout);
|
2019-06-11 07:35:24 +08:00
|
|
|
nth_midxed_pack_midx_entry(m,
|
2018-07-13 03:39:36 +08:00
|
|
|
&entries_by_fanout[nr_fanout],
|
|
|
|
cur_object);
|
2021-03-30 23:04:11 +08:00
|
|
|
if (nth_midxed_pack_int_id(m, cur_object) == preferred_pack)
|
|
|
|
entries_by_fanout[nr_fanout].preferred = 1;
|
|
|
|
else
|
|
|
|
entries_by_fanout[nr_fanout].preferred = 0;
|
2018-07-13 03:39:36 +08:00
|
|
|
nr_fanout++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (cur_pack = start_pack; cur_pack < nr_packs; cur_pack++) {
|
2018-07-13 03:39:29 +08:00
|
|
|
uint32_t start = 0, end;
|
2021-03-30 23:04:11 +08:00
|
|
|
int preferred = cur_pack == preferred_pack;
|
2018-07-13 03:39:29 +08:00
|
|
|
|
|
|
|
if (cur_fanout)
|
2019-06-11 07:35:24 +08:00
|
|
|
start = get_pack_fanout(info[cur_pack].p, cur_fanout - 1);
|
|
|
|
end = get_pack_fanout(info[cur_pack].p, cur_fanout);
|
2018-07-13 03:39:29 +08:00
|
|
|
|
|
|
|
for (cur_object = start; cur_object < end; cur_object++) {
|
|
|
|
ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout);
|
2021-03-30 23:04:11 +08:00
|
|
|
fill_pack_entry(cur_pack,
|
|
|
|
info[cur_pack].p,
|
|
|
|
cur_object,
|
|
|
|
&entries_by_fanout[nr_fanout],
|
|
|
|
preferred);
|
2018-07-13 03:39:29 +08:00
|
|
|
nr_fanout++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
QSORT(entries_by_fanout, nr_fanout, midx_oid_compare);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The batch is now sorted by OID and then mtime (descending).
|
|
|
|
* Take only the first duplicate.
|
|
|
|
*/
|
|
|
|
for (cur_object = 0; cur_object < nr_fanout; cur_object++) {
|
2018-10-03 05:19:21 +08:00
|
|
|
if (cur_object && oideq(&entries_by_fanout[cur_object - 1].oid,
|
|
|
|
&entries_by_fanout[cur_object].oid))
|
2018-07-13 03:39:29 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
ALLOC_GROW(deduplicated_entries, *nr_objects + 1, alloc_objects);
|
|
|
|
memcpy(&deduplicated_entries[*nr_objects],
|
|
|
|
&entries_by_fanout[cur_object],
|
|
|
|
sizeof(struct pack_midx_entry));
|
|
|
|
(*nr_objects)++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
free(entries_by_fanout);
|
|
|
|
return deduplicated_entries;
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
static int write_midx_pack_names(struct hashfile *f, void *data)
|
2018-07-13 03:39:27 +08:00
|
|
|
{
|
2021-02-18 22:07:27 +08:00
|
|
|
struct write_midx_context *ctx = data;
|
2018-07-13 03:39:27 +08:00
|
|
|
uint32_t i;
|
|
|
|
unsigned char padding[MIDX_CHUNK_ALIGNMENT];
|
|
|
|
size_t written = 0;
|
|
|
|
|
2021-02-18 22:07:27 +08:00
|
|
|
for (i = 0; i < ctx->nr; i++) {
|
2019-06-11 07:35:25 +08:00
|
|
|
size_t writelen;
|
|
|
|
|
2021-02-18 22:07:27 +08:00
|
|
|
if (ctx->info[i].expired)
|
2019-06-11 07:35:25 +08:00
|
|
|
continue;
|
2018-07-13 03:39:27 +08:00
|
|
|
|
2021-02-18 22:07:27 +08:00
|
|
|
if (i && strcmp(ctx->info[i].pack_name, ctx->info[i - 1].pack_name) <= 0)
|
2018-07-13 03:39:27 +08:00
|
|
|
BUG("incorrect pack-file order: %s before %s",
|
2021-02-18 22:07:27 +08:00
|
|
|
ctx->info[i - 1].pack_name,
|
|
|
|
ctx->info[i].pack_name);
|
2018-07-13 03:39:27 +08:00
|
|
|
|
2021-02-18 22:07:27 +08:00
|
|
|
writelen = strlen(ctx->info[i].pack_name) + 1;
|
|
|
|
hashwrite(f, ctx->info[i].pack_name, writelen);
|
2018-07-13 03:39:27 +08:00
|
|
|
written += writelen;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* add padding to be aligned */
|
|
|
|
i = MIDX_CHUNK_ALIGNMENT - (written % MIDX_CHUNK_ALIGNMENT);
|
|
|
|
if (i < MIDX_CHUNK_ALIGNMENT) {
|
|
|
|
memset(padding, 0, sizeof(padding));
|
|
|
|
hashwrite(f, padding, i);
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
return 0;
|
2018-07-13 03:39:27 +08:00
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
static int write_midx_oid_fanout(struct hashfile *f,
|
|
|
|
void *data)
|
2018-07-13 03:39:31 +08:00
|
|
|
{
|
2021-02-18 22:07:28 +08:00
|
|
|
struct write_midx_context *ctx = data;
|
|
|
|
struct pack_midx_entry *list = ctx->entries;
|
|
|
|
struct pack_midx_entry *last = ctx->entries + ctx->entries_nr;
|
2018-07-13 03:39:31 +08:00
|
|
|
uint32_t count = 0;
|
|
|
|
uint32_t i;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write the first-level table (the list is sorted,
|
|
|
|
* but we use a 256-entry lookup to be able to avoid
|
|
|
|
* having to do eight extra binary search iterations).
|
|
|
|
*/
|
|
|
|
for (i = 0; i < 256; i++) {
|
|
|
|
struct pack_midx_entry *next = list;
|
|
|
|
|
|
|
|
while (next < last && next->oid.hash[0] == i) {
|
|
|
|
count++;
|
|
|
|
next++;
|
|
|
|
}
|
|
|
|
|
|
|
|
hashwrite_be32(f, count);
|
|
|
|
list = next;
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
return 0;
|
2018-07-13 03:39:31 +08:00
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
static int write_midx_oid_lookup(struct hashfile *f,
|
|
|
|
void *data)
|
2018-07-13 03:39:30 +08:00
|
|
|
{
|
2021-02-18 22:07:28 +08:00
|
|
|
struct write_midx_context *ctx = data;
|
|
|
|
unsigned char hash_len = the_hash_algo->rawsz;
|
|
|
|
struct pack_midx_entry *list = ctx->entries;
|
2018-07-13 03:39:30 +08:00
|
|
|
uint32_t i;
|
|
|
|
|
2021-02-18 22:07:28 +08:00
|
|
|
for (i = 0; i < ctx->entries_nr; i++) {
|
2018-07-13 03:39:30 +08:00
|
|
|
struct pack_midx_entry *obj = list++;
|
|
|
|
|
2021-02-18 22:07:28 +08:00
|
|
|
if (i < ctx->entries_nr - 1) {
|
2018-07-13 03:39:30 +08:00
|
|
|
struct pack_midx_entry *next = list;
|
|
|
|
if (oidcmp(&obj->oid, &next->oid) >= 0)
|
|
|
|
BUG("OIDs not in order: %s >= %s",
|
|
|
|
oid_to_hex(&obj->oid),
|
|
|
|
oid_to_hex(&next->oid));
|
|
|
|
}
|
|
|
|
|
|
|
|
hashwrite(f, obj->oid.hash, (int)hash_len);
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
return 0;
|
2018-07-13 03:39:30 +08:00
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
static int write_midx_object_offsets(struct hashfile *f,
|
|
|
|
void *data)
|
2018-07-13 03:39:32 +08:00
|
|
|
{
|
2021-02-18 22:07:29 +08:00
|
|
|
struct write_midx_context *ctx = data;
|
|
|
|
struct pack_midx_entry *list = ctx->entries;
|
2018-07-13 03:39:32 +08:00
|
|
|
uint32_t i, nr_large_offset = 0;
|
|
|
|
|
2021-02-18 22:07:29 +08:00
|
|
|
for (i = 0; i < ctx->entries_nr; i++) {
|
2018-07-13 03:39:32 +08:00
|
|
|
struct pack_midx_entry *obj = list++;
|
|
|
|
|
2021-02-18 22:07:29 +08:00
|
|
|
if (ctx->pack_perm[obj->pack_int_id] == PACK_EXPIRED)
|
2019-06-11 07:35:25 +08:00
|
|
|
BUG("object %s is in an expired pack with int-id %d",
|
|
|
|
oid_to_hex(&obj->oid),
|
|
|
|
obj->pack_int_id);
|
|
|
|
|
2021-02-18 22:07:29 +08:00
|
|
|
hashwrite_be32(f, ctx->pack_perm[obj->pack_int_id]);
|
2018-07-13 03:39:32 +08:00
|
|
|
|
2021-02-18 22:07:29 +08:00
|
|
|
if (ctx->large_offsets_needed && obj->offset >> 31)
|
2018-07-13 03:39:32 +08:00
|
|
|
hashwrite_be32(f, MIDX_LARGE_OFFSET_NEEDED | nr_large_offset++);
|
2021-02-18 22:07:29 +08:00
|
|
|
else if (!ctx->large_offsets_needed && obj->offset >> 32)
|
2018-07-13 03:39:32 +08:00
|
|
|
BUG("object %s requires a large offset (%"PRIx64") but the MIDX is not writing large offsets!",
|
|
|
|
oid_to_hex(&obj->oid),
|
|
|
|
obj->offset);
|
|
|
|
else
|
|
|
|
hashwrite_be32(f, (uint32_t)obj->offset);
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
return 0;
|
2018-07-13 03:39:32 +08:00
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
static int write_midx_large_offsets(struct hashfile *f,
|
|
|
|
void *data)
|
2018-07-13 03:39:32 +08:00
|
|
|
{
|
2021-02-18 22:07:30 +08:00
|
|
|
struct write_midx_context *ctx = data;
|
|
|
|
struct pack_midx_entry *list = ctx->entries;
|
|
|
|
struct pack_midx_entry *end = ctx->entries + ctx->entries_nr;
|
|
|
|
uint32_t nr_large_offset = ctx->num_large_offsets;
|
2018-07-13 03:39:32 +08:00
|
|
|
|
|
|
|
while (nr_large_offset) {
|
2018-11-04 10:27:46 +08:00
|
|
|
struct pack_midx_entry *obj;
|
|
|
|
uint64_t offset;
|
|
|
|
|
|
|
|
if (list >= end)
|
|
|
|
BUG("too many large-offset objects");
|
|
|
|
|
|
|
|
obj = list++;
|
|
|
|
offset = obj->offset;
|
2018-07-13 03:39:32 +08:00
|
|
|
|
|
|
|
if (!(offset >> 31))
|
|
|
|
continue;
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
hashwrite_be64(f, offset);
|
2018-07-13 03:39:32 +08:00
|
|
|
|
|
|
|
nr_large_offset--;
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:31 +08:00
|
|
|
return 0;
|
2018-07-13 03:39:32 +08:00
|
|
|
}
|
|
|
|
|
2021-03-30 23:04:36 +08:00
|
|
|
/* Sort record that midx_pack_order() builds for each MIDX entry. */
struct midx_pack_order_data {
	/* index of the entry in ctx->entries */
	uint32_t nr;
	/* pack id after pack_perm; bit 31 is set for non-preferred entries */
	uint32_t pack;
	/* the entry's offset */
	off_t offset;
};
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format described a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
|
2021-03-30 23:04:36 +08:00
|
|
|
static int midx_pack_order_cmp(const void *va, const void *vb)
|
|
|
|
{
|
|
|
|
const struct midx_pack_order_data *a = va, *b = vb;
|
|
|
|
if (a->pack < b->pack)
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
return -1;
|
2021-03-30 23:04:36 +08:00
|
|
|
else if (a->pack > b->pack)
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
return 1;
|
2021-03-30 23:04:36 +08:00
|
|
|
else if (a->offset < b->offset)
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
return -1;
|
2021-03-30 23:04:36 +08:00
|
|
|
else if (a->offset > b->offset)
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
return 1;
|
2021-03-30 23:04:36 +08:00
|
|
|
else
|
|
|
|
return 0;
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t *midx_pack_order(struct write_midx_context *ctx)
|
|
|
|
{
|
2021-03-30 23:04:36 +08:00
|
|
|
struct midx_pack_order_data *data;
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
uint32_t *pack_order;
|
|
|
|
uint32_t i;
|
|
|
|
|
2021-03-30 23:04:36 +08:00
|
|
|
ALLOC_ARRAY(data, ctx->entries_nr);
|
|
|
|
for (i = 0; i < ctx->entries_nr; i++) {
|
|
|
|
struct pack_midx_entry *e = &ctx->entries[i];
|
|
|
|
data[i].nr = i;
|
|
|
|
data[i].pack = ctx->pack_perm[e->pack_int_id];
|
|
|
|
if (!e->preferred)
|
|
|
|
data[i].pack |= (1U << 31);
|
|
|
|
data[i].offset = e->offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
QSORT(data, ctx->entries_nr, midx_pack_order_cmp);
|
|
|
|
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
ALLOC_ARRAY(pack_order, ctx->entries_nr);
|
|
|
|
for (i = 0; i < ctx->entries_nr; i++)
|
2021-03-30 23:04:36 +08:00
|
|
|
pack_order[i] = data[i].nr;
|
|
|
|
free(data);
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
|
|
|
|
return pack_order;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void write_midx_reverse_index(char *midx_name, unsigned char *midx_hash,
|
|
|
|
struct write_midx_context *ctx)
|
|
|
|
{
|
|
|
|
struct strbuf buf = STRBUF_INIT;
|
|
|
|
const char *tmp_file;
|
|
|
|
|
|
|
|
strbuf_addf(&buf, "%s-%s.rev", midx_name, hash_to_hex(midx_hash));
|
|
|
|
|
|
|
|
tmp_file = write_rev_file_order(NULL, ctx->pack_order, ctx->entries_nr,
|
|
|
|
midx_hash, WRITE_REV);
|
|
|
|
|
|
|
|
if (finalize_object_file(tmp_file, buf.buf))
|
|
|
|
die(_("cannot store reverse index file"));
|
|
|
|
|
|
|
|
strbuf_release(&buf);
|
|
|
|
}
|
|
|
|
|
2021-09-01 04:51:55 +08:00
|
|
|
static void clear_midx_files_ext(const char *object_dir, const char *ext,
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format described a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
unsigned char *keep_hash);
|
|
|
|
|
midx: don't reuse corrupt MIDXs when writing
When writing a new multi-pack index, Git tries to reuse as much of the
data from an existing MIDX as possible, like object offsets. This is
done to avoid re-opening a bunch of *.idx files unnecessarily, but can
lead to problems if the data we are reusing is corrupt.
That's because we'll blindly reuse data from an existing MIDX without
checking its trailing checksum for validity. So if there is memory
corruption while writing a MIDX, or disk corruption in the intervening
period between writing and reuse, we'll blindly propagate those bad
values forward.
Suppose we experience a memory corruption while writing a MIDX such that
we write an incorrect object offset (or alternatively, the disk corrupts
the data after being written, but before being reused). Then when we go
to write a new MIDX, we'll reuse the bad object offset without checking
its validity. This means that the MIDX we just wrote is broken, but its
trailing checksum is intact, since we never bothered to look at the
values before writing.
In the above, a "git multi-pack-index verify" would have caught the
problem before writing, but writing a new MIDX wouldn't have noticed
anything wrong, blindly carrying forward the corrupt offset.
Individual pack indexes check their validity by verifying the crc32
attached to each entry when carrying data forward during a repack.
We could solve this problem for MIDXs in the same way, but individual
crc32's don't make much sense, since their entries are so small.
Likewise, checking the whole file on every read may be prohibitively
expensive if a repository has a lot of objects, packs, or both.
But we can check the trailing checksum when reusing an existing MIDX
when writing a new one. And a corrupt MIDX need not stop us from writing
a new one, since we can just avoid reusing the existing one at all and
pretend as if we are writing a new MIDX from scratch.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-06-24 02:39:12 +08:00
|
|
|
static int midx_checksum_valid(struct multi_pack_index *m)
|
|
|
|
{
|
|
|
|
return hashfile_checksum_valid(m->data, m->data_len);
|
|
|
|
}
|
|
|
|
|
midx: avoid opening multiple MIDXs when writing
Opening multiple instances of the same MIDX can lead to problems like two
separate packed_git structures which represent the same pack being added
to the repository's object store.
The above scenario can happen because prepare_midx_pack() checks if
`m->packs[pack_int_id]` is NULL in order to determine if a pack has been
opened and installed in the repository before. But a caller can
construct two copies of the same MIDX by calling get_multi_pack_index()
and load_multi_pack_index() since the former manipulates the
object store directly but the latter is a lower-level routine which
allocates a new MIDX for each call.
So if prepare_midx_pack() is called on multiple MIDXs with the same
pack_int_id, then that pack will be installed twice in the object
store's packed_git pointer.
This can lead to problems in, for e.g., the pack-bitmap code, which does
something like the following (in pack-bitmap.c:open_pack_bitmap()):
struct bitmap_index *bitmap_git = ...;
for (p = get_all_packs(r); p; p = p->next) {
if (open_pack_bitmap_1(bitmap_git, p) == 0)
ret = 0;
}
which is a problem if two copies of the same pack exist in the
packed_git list because pack-bitmap.c:open_pack_bitmap_1() contains a
conditional like the following:
if (bitmap_git->pack || bitmap_git->midx) {
/* ignore extra bitmap file; we can only handle one */
warning("ignoring extra bitmap file: %s", packfile->pack_name);
close(fd);
return -1;
}
Avoid this scenario by not letting write_midx_internal() open a MIDX
that isn't also pointed at by the object store. So long as this is the
case, other routines should prefer to open MIDXs with
get_multi_pack_index() or reprepare_packed_git() instead of creating
instances on their own. Because get_multi_pack_index() returns
`r->object_store->multi_pack_index` if it is non-NULL, we'll only have
one instance of a MIDX open at one time, avoiding these problems.
To encourage this, drop the `struct multi_pack_index *` parameter from
`write_midx_internal()`, and rely instead on the `object_dir` to find
(or initialize) the correct MIDX instance.
Likewise, replace the call to `close_midx()` with
`close_object_store()`, since we're about to replace the MIDX with a new
one and should invalidate the object store's memory of any MIDX that
might have existed beforehand.
Note that this now forbids passing object directories that don't belong
to alternate repositories over `--object-dir`, since before we would
have happily opened a MIDX in any directory, but now restrict ourselves
to only those reachable by `r->objects->multi_pack_index` (and alternate
MIDXs that we can see by walking the `next` pointer).
As far as I can tell, supporting arbitrary directories with
`--object-dir` was a historical accident, since even the documentation
says `<alt>` when referring to the value passed to this option.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-02 04:34:01 +08:00
|
|
|
static int write_midx_internal(const char *object_dir,
|
2021-03-30 23:04:11 +08:00
|
|
|
struct string_list *packs_to_drop,
|
|
|
|
const char *preferred_pack_name,
|
|
|
|
unsigned flags)
|
2018-07-13 03:39:21 +08:00
|
|
|
{
|
2018-07-13 03:39:22 +08:00
|
|
|
char *midx_name;
|
2021-03-30 23:04:17 +08:00
|
|
|
unsigned char midx_hash[GIT_MAX_RAWSZ];
|
2018-07-13 03:39:26 +08:00
|
|
|
uint32_t i;
|
2018-07-13 03:39:22 +08:00
|
|
|
struct hashfile *f = NULL;
|
|
|
|
struct lock_file lk;
|
2021-02-18 22:07:26 +08:00
|
|
|
struct write_midx_context ctx = { 0 };
|
midx: avoid opening multiple MIDXs when writing
Opening multiple instances of the same MIDX can lead to problems like two
separate packed_git structures which represent the same pack being added
to the repository's object store.
The above scenario can happen because prepare_midx_pack() checks if
`m->packs[pack_int_id]` is NULL in order to determine if a pack has been
opened and installed in the repository before. But a caller can
construct two copies of the same MIDX by calling get_multi_pack_index()
and load_multi_pack_index() since the former manipulates the
object store directly but the latter is a lower-level routine which
allocates a new MIDX for each call.
So if prepare_midx_pack() is called on multiple MIDXs with the same
pack_int_id, then that pack will be installed twice in the object
store's packed_git pointer.
This can lead to problems in, for e.g., the pack-bitmap code, which does
something like the following (in pack-bitmap.c:open_pack_bitmap()):
struct bitmap_index *bitmap_git = ...;
for (p = get_all_packs(r); p; p = p->next) {
if (open_pack_bitmap_1(bitmap_git, p) == 0)
ret = 0;
}
which is a problem if two copies of the same pack exist in the
packed_git list because pack-bitmap.c:open_pack_bitmap_1() contains a
conditional like the following:
if (bitmap_git->pack || bitmap_git->midx) {
/* ignore extra bitmap file; we can only handle one */
warning("ignoring extra bitmap file: %s", packfile->pack_name);
close(fd);
return -1;
}
Avoid this scenario by not letting write_midx_internal() open a MIDX
that isn't also pointed at by the object store. So long as this is the
case, other routines should prefer to open MIDXs with
get_multi_pack_index() or reprepare_packed_git() instead of creating
instances on their own. Because get_multi_pack_index() returns
`r->object_store->multi_pack_index` if it is non-NULL, we'll only have
one instance of a MIDX open at one time, avoiding these problems.
To encourage this, drop the `struct multi_pack_index *` parameter from
`write_midx_internal()`, and rely instead on the `object_dir` to find
(or initialize) the correct MIDX instance.
Likewise, replace the call to `close_midx()` with
`close_object_store()`, since we're about to replace the MIDX with a new
one and should invalidate the object store's memory of any MIDX that
might have existed beforehand.
Note that this now forbids passing object directories that don't belong
to alternate repositories over `--object-dir`, since before we would
have happily opened a MIDX in any directory, but now restrict ourselves
to only those reachable by `r->objects->multi_pack_index` (and alternate
MIDXs that we can see by walking the `next` pointer).
As far as I can tell, supporting arbitrary directories with
`--object-dir` was a historical accident, since even the documentation
says `<alt>` when referring to the value passed to this option.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-02 04:34:01 +08:00
|
|
|
struct multi_pack_index *cur;
|
2019-06-11 07:35:24 +08:00
|
|
|
int pack_name_concat_len = 0;
|
2019-06-11 07:35:25 +08:00
|
|
|
int dropped_packs = 0;
|
|
|
|
int result = 0;
|
2021-02-18 22:07:33 +08:00
|
|
|
struct chunkfile *cf;
|
2018-07-13 03:39:22 +08:00
|
|
|
|
midx: avoid opening multiple MIDXs when writing
Opening multiple instances of the same MIDX can lead to problems like two
separate packed_git structures which represent the same pack being added
to the repository's object store.
The above scenario can happen because prepare_midx_pack() checks if
`m->packs[pack_int_id]` is NULL in order to determine if a pack has been
opened and installed in the repository before. But a caller can
construct two copies of the same MIDX by calling get_multi_pack_index()
and load_multi_pack_index() since the former manipulates the
object store directly but the latter is a lower-level routine which
allocates a new MIDX for each call.
So if prepare_midx_pack() is called on multiple MIDXs with the same
pack_int_id, then that pack will be installed twice in the object
store's packed_git pointer.
This can lead to problems in, for e.g., the pack-bitmap code, which does
something like the following (in pack-bitmap.c:open_pack_bitmap()):
struct bitmap_index *bitmap_git = ...;
for (p = get_all_packs(r); p; p = p->next) {
if (open_pack_bitmap_1(bitmap_git, p) == 0)
ret = 0;
}
which is a problem if two copies of the same pack exist in the
packed_git list because pack-bitmap.c:open_pack_bitmap_1() contains a
conditional like the following:
if (bitmap_git->pack || bitmap_git->midx) {
/* ignore extra bitmap file; we can only handle one */
warning("ignoring extra bitmap file: %s", packfile->pack_name);
close(fd);
return -1;
}
Avoid this scenario by not letting write_midx_internal() open a MIDX
that isn't also pointed at by the object store. So long as this is the
case, other routines should prefer to open MIDXs with
get_multi_pack_index() or reprepare_packed_git() instead of creating
instances on their own. Because get_multi_pack_index() returns
`r->object_store->multi_pack_index` if it is non-NULL, we'll only have
one instance of a MIDX open at one time, avoiding these problems.
To encourage this, drop the `struct multi_pack_index *` parameter from
`write_midx_internal()`, and rely instead on the `object_dir` to find
(or initialize) the correct MIDX instance.
Likewise, replace the call to `close_midx()` with
`close_object_store()`, since we're about to replace the MIDX with a new
one and should invalidate the object store's memory of any MIDX that
might have existed beforehand.
Note that this now forbids passing object directories that don't belong
to alternate repositories over `--object-dir`, since before we would
have happily opened a MIDX in any directory, but now restrict ourselves
to only those reachable by `r->objects->multi_pack_index` (and alternate
MIDXs that we can see by walking the `next` pointer).
As far as I can tell, supporting arbitrary directories with
`--object-dir` was a historical accident, since even the documentation
says `<alt>` when referring to the value passed to this option.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-02 04:34:01 +08:00
|
|
|
/* Ensure the given object_dir is local, or a known alternate. */
|
|
|
|
find_odb(the_repository, object_dir);
|
|
|
|
|
2018-07-13 03:39:22 +08:00
|
|
|
midx_name = get_midx_filename(object_dir);
|
2020-08-13 23:55:00 +08:00
|
|
|
if (safe_create_leading_directories(midx_name))
|
2018-07-13 03:39:22 +08:00
|
|
|
die_errno(_("unable to create leading directories of %s"),
|
|
|
|
midx_name);
|
|
|
|
|
midx: avoid opening multiple MIDXs when writing
Opening multiple instances of the same MIDX can lead to problems like two
separate packed_git structures which represent the same pack being added
to the repository's object store.
The above scenario can happen because prepare_midx_pack() checks if
`m->packs[pack_int_id]` is NULL in order to determine if a pack has been
opened and installed in the repository before. But a caller can
construct two copies of the same MIDX by calling get_multi_pack_index()
and load_multi_pack_index() since the former manipulates the
object store directly but the latter is a lower-level routine which
allocates a new MIDX for each call.
So if prepare_midx_pack() is called on multiple MIDXs with the same
pack_int_id, then that pack will be installed twice in the object
store's packed_git pointer.
This can lead to problems in, for e.g., the pack-bitmap code, which does
something like the following (in pack-bitmap.c:open_pack_bitmap()):
struct bitmap_index *bitmap_git = ...;
for (p = get_all_packs(r); p; p = p->next) {
if (open_pack_bitmap_1(bitmap_git, p) == 0)
ret = 0;
}
which is a problem if two copies of the same pack exist in the
packed_git list because pack-bitmap.c:open_pack_bitmap_1() contains a
conditional like the following:
if (bitmap_git->pack || bitmap_git->midx) {
/* ignore extra bitmap file; we can only handle one */
warning("ignoring extra bitmap file: %s", packfile->pack_name);
close(fd);
return -1;
}
Avoid this scenario by not letting write_midx_internal() open a MIDX
that isn't also pointed at by the object store. So long as this is the
case, other routines should prefer to open MIDXs with
get_multi_pack_index() or reprepare_packed_git() instead of creating
instances on their own. Because get_multi_pack_index() returns
`r->object_store->multi_pack_index` if it is non-NULL, we'll only have
one instance of a MIDX open at one time, avoiding these problems.
To encourage this, drop the `struct multi_pack_index *` parameter from
`write_midx_internal()`, and rely instead on the `object_dir` to find
(or initialize) the correct MIDX instance.
Likewise, replace the call to `close_midx()` with
`close_object_store()`, since we're about to replace the MIDX with a new
one and should invalidate the object store's memory of any MIDX that
might have existed beforehand.
Note that this now forbids passing object directories that don't belong
to alternate repositories over `--object-dir`, since before we would
have happily opened a MIDX in any directory, but now restrict ourselves
to only those reachable by `r->objects->multi_pack_index` (and alternate
MIDXs that we can see by walking the `next` pointer).
As far as I can tell, supporting arbitrary directories with
`--object-dir` was a historical accident, since even the documentation
says `<alt>` when referring to the value passed to this option.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-02 04:34:01 +08:00
|
|
|
for (cur = get_multi_pack_index(the_repository); cur; cur = cur->next) {
|
|
|
|
if (!strcmp(object_dir, cur->object_dir)) {
|
|
|
|
ctx.m = cur;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2021-02-18 22:07:26 +08:00
|
|
|
|
midx: don't reuse corrupt MIDXs when writing
When writing a new multi-pack index, Git tries to reuse as much of the
data from an existing MIDX as possible, like object offsets. This is
done to avoid re-opening a bunch of *.idx files unnecessarily, but can
lead to problems if the data we are reusing is corrupt.
That's because we'll blindly reuse data from an existing MIDX without
checking its trailing checksum for validity. So if there is memory
corruption while writing a MIDX, or disk corruption in the intervening
period between writing and reuse, we'll blindly propagate those bad
values forward.
Suppose we experience a memory corruption while writing a MIDX such that
we write an incorrect object offset (or alternatively, the disk corrupts
the data after being written, but before being reused). Then when we go
to write a new MIDX, we'll reuse the bad object offset without checking
its validity. This means that the MIDX we just wrote is broken, but its
trailing checksum is intact, since we never bothered to look at the
values before writing.
In the above, a "git multi-pack-index verify" would have caught the
problem before writing, but writing a new MIDX wouldn't have noticed
anything wrong, blindly carrying forward the corrupt offset.
Individual pack indexes check their validity by verifying the crc32
attached to each entry when carrying data forward during a repack.
We could solve this problem for MIDXs in the same way, but individual
crc32's don't make much sense, since their entries are so small.
Likewise, checking the whole file on every read may be prohibitively
expensive if a repository has a lot of objects, packs, or both.
But we can check the trailing checksum when reusing an existing MIDX
when writing a new one. And a corrupt MIDX need not stop us from writing
a new one, since we can just avoid reusing the existing one at all and
pretend as if we are writing a new MIDX from scratch.
Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-06-24 02:39:12 +08:00
|
|
|
if (ctx.m && !midx_checksum_valid(ctx.m)) {
|
|
|
|
warning(_("ignoring existing multi-pack-index; checksum mismatch"));
|
|
|
|
ctx.m = NULL;
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
ctx.nr = 0;
|
|
|
|
ctx.alloc = ctx.m ? ctx.m->num_packs : 16;
|
|
|
|
ctx.info = NULL;
|
|
|
|
ALLOC_ARRAY(ctx.info, ctx.alloc);
|
|
|
|
|
|
|
|
if (ctx.m) {
|
|
|
|
for (i = 0; i < ctx.m->num_packs; i++) {
|
|
|
|
ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc);
|
|
|
|
|
|
|
|
ctx.info[ctx.nr].orig_pack_int_id = i;
|
|
|
|
ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]);
|
|
|
|
ctx.info[ctx.nr].p = NULL;
|
|
|
|
ctx.info[ctx.nr].expired = 0;
|
2021-09-01 04:52:02 +08:00
|
|
|
|
|
|
|
if (flags & MIDX_WRITE_REV_INDEX) {
|
|
|
|
/*
|
|
|
|
* If generating a reverse index, need to have
|
|
|
|
* packed_git's loaded to compare their
|
|
|
|
* mtimes and object count.
|
|
|
|
*/
|
|
|
|
if (prepare_midx_pack(the_repository, ctx.m, i)) {
|
|
|
|
error(_("could not load pack"));
|
|
|
|
result = 1;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (open_pack_index(ctx.m->packs[i]))
|
|
|
|
die(_("could not open index for %s"),
|
|
|
|
ctx.m->packs[i]->pack_name);
|
|
|
|
ctx.info[ctx.nr].p = ctx.m->packs[i];
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
ctx.nr++;
|
2018-07-13 03:39:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
ctx.pack_paths_checked = 0;
|
2019-10-22 02:39:59 +08:00
|
|
|
if (flags & MIDX_PROGRESS)
|
2021-02-18 22:07:26 +08:00
|
|
|
ctx.progress = start_delayed_progress(_("Adding packfiles to multi-pack-index"), 0);
|
2019-10-22 02:39:59 +08:00
|
|
|
else
|
2021-02-18 22:07:26 +08:00
|
|
|
ctx.progress = NULL;
|
2019-10-22 02:39:59 +08:00
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
for_each_file_in_pack_dir(object_dir, add_pack_to_midx, &ctx);
|
|
|
|
stop_progress(&ctx.progress);
|
2018-07-13 03:39:26 +08:00
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop)
|
2018-07-13 03:39:36 +08:00
|
|
|
goto cleanup;
|
|
|
|
|
2021-03-30 23:04:11 +08:00
|
|
|
if (preferred_pack_name) {
|
midx: infer preferred pack when not given one
In 9218c6a40c (midx: allow marking a pack as preferred, 2021-03-30), the
multi-pack index code learned how to select a pack which all duplicate
objects are selected from. That is, if an object appears in multiple
packs, select the copy in the preferred pack before breaking ties
according to the other rules like pack mtime and readdir() order.
Not specifying a preferred pack can cause serious problems with
multi-pack reachability bitmaps, because these bitmaps rely on having at
least one pack from which all duplicates are selected. Not having such a
pack causes problems with the code in pack-objects to reuse packs
verbatim (e.g., that code assumes that a delta object in a chunk of pack
sent verbatim will have its base object sent from the same pack).
So why does not marking a pack preferred cause problems here? The reason
is roughly as follows:
- Ties are broken (when handling duplicate objects) by sorting
according to midx_oid_compare(), which sorts objects by OID,
preferred-ness, pack mtime, and finally pack ID (more on that
later).
- The pseudo pack-order (described in
Documentation/technical/pack-format.txt under the section
"multi-pack-index reverse indexes") is computed by
midx_pack_order(), and sorts by pack ID and pack offset, with
preferred packs sorting first.
- But! Pack IDs come from incrementing the pack count in
add_pack_to_midx(), which is a callback to
for_each_file_in_pack_dir(), meaning that pack IDs are assigned in
readdir() order.
When specifying a preferred pack, all of that works fine, because
duplicate objects are correctly resolved in favor of the copy in the
preferred pack, and the preferred pack sorts first in the object order.
"Sorting first" is critical, because the bitmap code relies on finding
out which pack holds the first object in the MIDX's pseudo pack-order to
determine which pack is preferred.
But if we didn't specify a preferred pack, and the pack which comes
first in readdir() order does not also have the lowest timestamp, then
it's possible that that pack (the one that sorts first in pseudo-pack
order, which the bitmap code will treat as the preferred one) did *not*
have all duplicate objects resolved in its favor, resulting in breakage.
The fix is simple: pick a (semi-arbitrary, non-empty) preferred pack
when none was specified. This forces that pack to have duplicates
resolved in its favor, and (critically) to sort first in pseudo-pack
order. Unfortunately, testing this behavior portably isn't possible,
since it depends on readdir() order which isn't guaranteed by POSIX.
(Note that multi-pack reachability bitmaps have yet to be implemented;
so in that sense this patch is fixing a bug which does not yet exist.
But by having this patch beforehand, we can prevent the bug from ever
materializing.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-01 04:52:04 +08:00
|
|
|
int found = 0;
|
2021-03-30 23:04:11 +08:00
|
|
|
for (i = 0; i < ctx.nr; i++) {
|
|
|
|
if (!cmp_idx_or_pack_name(preferred_pack_name,
|
|
|
|
ctx.info[i].pack_name)) {
|
|
|
|
ctx.preferred_pack_idx = i;
|
midx: infer preferred pack when not given one
In 9218c6a40c (midx: allow marking a pack as preferred, 2021-03-30), the
multi-pack index code learned how to select a pack which all duplicate
objects are selected from. That is, if an object appears in multiple
packs, select the copy in the preferred pack before breaking ties
according to the other rules like pack mtime and readdir() order.
Not specifying a preferred pack can cause serious problems with
multi-pack reachability bitmaps, because these bitmaps rely on having at
least one pack from which all duplicates are selected. Not having such a
pack causes problems with the code in pack-objects to reuse packs
verbatim (e.g., that code assumes that a delta object in a chunk of pack
sent verbatim will have its base object sent from the same pack).
So why does not marking a pack preferred cause problems here? The reason
is roughly as follows:
- Ties are broken (when handling duplicate objects) by sorting
according to midx_oid_compare(), which sorts objects by OID,
preferred-ness, pack mtime, and finally pack ID (more on that
later).
- The pseudo pack-order (described in
Documentation/technical/pack-format.txt under the section
"multi-pack-index reverse indexes") is computed by
midx_pack_order(), and sorts by pack ID and pack offset, with
preferred packs sorting first.
- But! Pack IDs come from incrementing the pack count in
add_pack_to_midx(), which is a callback to
for_each_file_in_pack_dir(), meaning that pack IDs are assigned in
readdir() order.
When specifying a preferred pack, all of that works fine, because
duplicate objects are correctly resolved in favor of the copy in the
preferred pack, and the preferred pack sorts first in the object order.
"Sorting first" is critical, because the bitmap code relies on finding
out which pack holds the first object in the MIDX's pseudo pack-order to
determine which pack is preferred.
But if we didn't specify a preferred pack, and the pack which comes
first in readdir() order does not also have the lowest timestamp, then
it's possible that that pack (the one that sorts first in pseudo-pack
order, which the bitmap code will treat as the preferred one) did *not*
have all duplicate objects resolved in its favor, resulting in breakage.
The fix is simple: pick a (semi-arbitrary, non-empty) preferred pack
when none was specified. This forces that pack to have duplicates
resolved in its favor, and (critically) to sort first in pseudo-pack
order. Unfortunately, testing this behavior portably isn't possible,
since it depends on readdir() order which isn't guaranteed by POSIX.
(Note that multi-pack reachability bitmaps have yet to be implemented;
so in that sense this patch is fixing a bug which does not yet exist.
But by having this patch beforehand, we can prevent the bug from ever
materializing.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-01 04:52:04 +08:00
|
|
|
found = 1;
|
2021-03-30 23:04:11 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
midx: infer preferred pack when not given one
In 9218c6a40c (midx: allow marking a pack as preferred, 2021-03-30), the
multi-pack index code learned how to select a pack which all duplicate
objects are selected from. That is, if an object appears in multiple
packs, select the copy in the preferred pack before breaking ties
according to the other rules like pack mtime and readdir() order.
Not specifying a preferred pack can cause serious problems with
multi-pack reachability bitmaps, because these bitmaps rely on having at
least one pack from which all duplicates are selected. Not having such a
pack causes problems with the code in pack-objects to reuse packs
verbatim (e.g., that code assumes that a delta object in a chunk of pack
sent verbatim will have its base object sent from the same pack).
So why does not marking a pack preferred cause problems here? The reason
is roughly as follows:
- Ties are broken (when handling duplicate objects) by sorting
according to midx_oid_compare(), which sorts objects by OID,
preferred-ness, pack mtime, and finally pack ID (more on that
later).
- The pseudo pack-order (described in
Documentation/technical/pack-format.txt under the section
"multi-pack-index reverse indexes") is computed by
midx_pack_order(), and sorts by pack ID and pack offset, with
preferred packs sorting first.
- But! Pack IDs come from incrementing the pack count in
add_pack_to_midx(), which is a callback to
for_each_file_in_pack_dir(), meaning that pack IDs are assigned in
readdir() order.
When specifying a preferred pack, all of that works fine, because
duplicate objects are correctly resolved in favor of the copy in the
preferred pack, and the preferred pack sorts first in the object order.
"Sorting first" is critical, because the bitmap code relies on finding
out which pack holds the first object in the MIDX's pseudo pack-order to
determine which pack is preferred.
But if we didn't specify a preferred pack, and the pack which comes
first in readdir() order does not also have the lowest timestamp, then
it's possible that that pack (the one that sorts first in pseudo-pack
order, which the bitmap code will treat as the preferred one) did *not*
have all duplicate objects resolved in its favor, resulting in breakage.
The fix is simple: pick a (semi-arbitrary, non-empty) preferred pack
when none was specified. This forces that pack to have duplicates
resolved in its favor, and (critically) to sort first in pseudo-pack
order. Unfortunately, testing this behavior portably isn't possible,
since it depends on readdir() order which isn't guaranteed by POSIX.
(Note that multi-pack reachability bitmaps have yet to be implemented;
so in that sense this patch is fixing a bug which does not yet exist.
But by having this patch beforehand, we can prevent the bug from ever
materializing.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-01 04:52:04 +08:00
|
|
|
|
|
|
|
if (!found)
|
|
|
|
warning(_("unknown preferred pack: '%s'"),
|
|
|
|
preferred_pack_name);
|
|
|
|
} else if (ctx.nr && (flags & MIDX_WRITE_REV_INDEX)) {
|
|
|
|
struct packed_git *oldest = ctx.info[ctx.preferred_pack_idx].p;
|
|
|
|
ctx.preferred_pack_idx = 0;
|
|
|
|
|
|
|
|
if (packs_to_drop && packs_to_drop->nr)
|
|
|
|
BUG("cannot write a MIDX bitmap during expiration");
|
|
|
|
|
|
|
|
/*
|
|
|
|
* set a preferred pack when writing a bitmap to ensure that
|
|
|
|
* the pack from which the first object is selected in pseudo
|
|
|
|
* pack-order has all of its objects selected from that pack
|
|
|
|
* (and not another pack containing a duplicate)
|
|
|
|
*/
|
|
|
|
for (i = 1; i < ctx.nr; i++) {
|
|
|
|
struct packed_git *p = ctx.info[i].p;
|
|
|
|
|
|
|
|
if (!oldest->num_objects || p->mtime < oldest->mtime) {
|
|
|
|
oldest = p;
|
|
|
|
ctx.preferred_pack_idx = i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!oldest->num_objects) {
|
|
|
|
/*
|
|
|
|
* If all packs are empty; unset the preferred index.
|
|
|
|
* This is acceptable since there will be no duplicate
|
|
|
|
* objects to resolve, so the preferred value doesn't
|
|
|
|
* matter.
|
|
|
|
*/
|
|
|
|
ctx.preferred_pack_idx = -1;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* otherwise don't mark any pack as preferred to avoid
|
|
|
|
* interfering with expiration logic below
|
|
|
|
*/
|
|
|
|
ctx.preferred_pack_idx = -1;
|
2021-03-30 23:04:11 +08:00
|
|
|
}
|
|
|
|
|
2021-09-01 04:52:02 +08:00
|
|
|
if (ctx.preferred_pack_idx > -1) {
|
|
|
|
struct packed_git *preferred = ctx.info[ctx.preferred_pack_idx].p;
|
|
|
|
if (!preferred->num_objects) {
|
|
|
|
error(_("cannot select preferred pack %s with no objects"),
|
|
|
|
preferred->pack_name);
|
|
|
|
result = 1;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-30 23:04:11 +08:00
|
|
|
ctx.entries = get_sorted_entries(ctx.m, ctx.info, ctx.nr, &ctx.entries_nr,
|
|
|
|
ctx.preferred_pack_idx);
|
2018-07-13 03:39:36 +08:00
|
|
|
|
2021-02-18 22:07:29 +08:00
|
|
|
ctx.large_offsets_needed = 0;
|
2021-02-18 22:07:28 +08:00
|
|
|
for (i = 0; i < ctx.entries_nr; i++) {
|
|
|
|
if (ctx.entries[i].offset > 0x7fffffff)
|
2021-02-18 22:07:30 +08:00
|
|
|
ctx.num_large_offsets++;
|
2021-02-18 22:07:28 +08:00
|
|
|
if (ctx.entries[i].offset > 0xffffffff)
|
2021-02-18 22:07:29 +08:00
|
|
|
ctx.large_offsets_needed = 1;
|
2018-07-13 03:39:32 +08:00
|
|
|
}
|
2018-07-13 03:39:29 +08:00
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
QSORT(ctx.info, ctx.nr, pack_info_compare);
|
2019-06-11 07:35:24 +08:00
|
|
|
|
2019-06-11 07:35:25 +08:00
|
|
|
if (packs_to_drop && packs_to_drop->nr) {
|
|
|
|
int drop_index = 0;
|
|
|
|
int missing_drops = 0;
|
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
for (i = 0; i < ctx.nr && drop_index < packs_to_drop->nr; i++) {
|
|
|
|
int cmp = strcmp(ctx.info[i].pack_name,
|
2019-06-11 07:35:25 +08:00
|
|
|
packs_to_drop->items[drop_index].string);
|
|
|
|
|
|
|
|
if (!cmp) {
|
|
|
|
drop_index++;
|
2021-02-18 22:07:26 +08:00
|
|
|
ctx.info[i].expired = 1;
|
2019-06-11 07:35:25 +08:00
|
|
|
} else if (cmp > 0) {
|
|
|
|
error(_("did not see pack-file %s to drop"),
|
|
|
|
packs_to_drop->items[drop_index].string);
|
|
|
|
drop_index++;
|
|
|
|
missing_drops++;
|
|
|
|
i--;
|
|
|
|
} else {
|
2021-02-18 22:07:26 +08:00
|
|
|
ctx.info[i].expired = 0;
|
2019-06-11 07:35:25 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (missing_drops) {
|
|
|
|
result = 1;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-11 07:35:24 +08:00
|
|
|
/*
|
|
|
|
* pack_perm stores a permutation between pack-int-ids from the
|
|
|
|
* previous multi-pack-index to the new one we are writing:
|
|
|
|
*
|
|
|
|
* pack_perm[old_id] = new_id
|
|
|
|
*/
|
2021-02-18 22:07:29 +08:00
|
|
|
ALLOC_ARRAY(ctx.pack_perm, ctx.nr);
|
2021-02-18 22:07:26 +08:00
|
|
|
for (i = 0; i < ctx.nr; i++) {
|
|
|
|
if (ctx.info[i].expired) {
|
2019-06-11 07:35:25 +08:00
|
|
|
dropped_packs++;
|
2021-02-18 22:07:29 +08:00
|
|
|
ctx.pack_perm[ctx.info[i].orig_pack_int_id] = PACK_EXPIRED;
|
2019-06-11 07:35:25 +08:00
|
|
|
} else {
|
2021-02-18 22:07:29 +08:00
|
|
|
ctx.pack_perm[ctx.info[i].orig_pack_int_id] = i - dropped_packs;
|
2019-06-11 07:35:25 +08:00
|
|
|
}
|
2019-06-11 07:35:24 +08:00
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
for (i = 0; i < ctx.nr; i++) {
|
|
|
|
if (!ctx.info[i].expired)
|
|
|
|
pack_name_concat_len += strlen(ctx.info[i].pack_name) + 1;
|
2019-06-11 07:35:25 +08:00
|
|
|
}
|
2019-06-11 07:35:24 +08:00
|
|
|
|
2021-03-30 23:04:11 +08:00
|
|
|
/* Check that the preferred pack wasn't expired (if given). */
|
|
|
|
if (preferred_pack_name) {
|
|
|
|
struct pack_info *preferred = bsearch(preferred_pack_name,
|
|
|
|
ctx.info, ctx.nr,
|
|
|
|
sizeof(*ctx.info),
|
|
|
|
idx_or_pack_name_cmp);
|
midx: infer preferred pack when not given one
In 9218c6a40c (midx: allow marking a pack as preferred, 2021-03-30), the
multi-pack index code learned how to select a pack which all duplicate
objects are selected from. That is, if an object appears in multiple
packs, select the copy in the preferred pack before breaking ties
according to the other rules like pack mtime and readdir() order.
Not specifying a preferred pack can cause serious problems with
multi-pack reachability bitmaps, because these bitmaps rely on having at
least one pack from which all duplicates are selected. Not having such a
pack causes problems with the code in pack-objects to reuse packs
verbatim (e.g., that code assumes that a delta object in a chunk of pack
sent verbatim will have its base object sent from the same pack).
So why does not marking a pack preferred cause problems here? The reason
is roughly as follows:
- Ties are broken (when handling duplicate objects) by sorting
according to midx_oid_compare(), which sorts objects by OID,
preferred-ness, pack mtime, and finally pack ID (more on that
later).
- The pseudo pack-order (described in
Documentation/technical/pack-format.txt under the section
"multi-pack-index reverse indexes") is computed by
midx_pack_order(), and sorts by pack ID and pack offset, with
preferred packs sorting first.
- But! Pack IDs come from incrementing the pack count in
add_pack_to_midx(), which is a callback to
for_each_file_in_pack_dir(), meaning that pack IDs are assigned in
readdir() order.
When specifying a preferred pack, all of that works fine, because
duplicate objects are correctly resolved in favor of the copy in the
preferred pack, and the preferred pack sorts first in the object order.
"Sorting first" is critical, because the bitmap code relies on finding
out which pack holds the first object in the MIDX's pseudo pack-order to
determine which pack is preferred.
But if we didn't specify a preferred pack, and the pack which comes
first in readdir() order does not also have the lowest timestamp, then
it's possible that that pack (the one that sorts first in pseudo-pack
order, which the bitmap code will treat as the preferred one) did *not*
have all duplicate objects resolved in its favor, resulting in breakage.
The fix is simple: pick a (semi-arbitrary, non-empty) preferred pack
when none was specified. This forces that pack to have duplicates
resolved in its favor, and (critically) to sort first in pseudo-pack
order. Unfortunately, testing this behavior portably isn't possible,
since it depends on readdir() order which isn't guaranteed by POSIX.
(Note that multi-pack reachability bitmaps have yet to be implemented;
so in that sense this patch is fixing a bug which does not yet exist.
But by having this patch beforehand, we can prevent the bug from ever
materializing.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-01 04:52:04 +08:00
|
|
|
if (preferred) {
|
2021-03-30 23:04:11 +08:00
|
|
|
uint32_t perm = ctx.pack_perm[preferred->orig_pack_int_id];
|
|
|
|
if (perm == PACK_EXPIRED)
|
|
|
|
warning(_("preferred pack '%s' is expired"),
|
|
|
|
preferred_pack_name);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-11 07:35:24 +08:00
|
|
|
if (pack_name_concat_len % MIDX_CHUNK_ALIGNMENT)
|
|
|
|
pack_name_concat_len += MIDX_CHUNK_ALIGNMENT -
|
|
|
|
(pack_name_concat_len % MIDX_CHUNK_ALIGNMENT);
|
|
|
|
|
2018-07-13 03:39:22 +08:00
|
|
|
hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
|
2021-01-06 03:23:48 +08:00
|
|
|
f = hashfd(get_lock_file_fd(&lk), get_lock_file_path(&lk));
|
2018-07-13 03:39:22 +08:00
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
if (ctx.m)
|
midx: avoid opening multiple MIDXs when writing
Opening multiple instances of the same MIDX can lead to problems like two
separate packed_git structures which represent the same pack being added
to the repository's object store.
The above scenario can happen because prepare_midx_pack() checks if
`m->packs[pack_int_id]` is NULL in order to determine if a pack has been
opened and installed in the repository before. But a caller can
construct two copies of the same MIDX by calling get_multi_pack_index()
and load_multi_pack_index() since the former manipulates the
object store directly but the latter is a lower-level routine which
allocates a new MIDX for each call.
So if prepare_midx_pack() is called on multiple MIDXs with the same
pack_int_id, then that pack will be installed twice in the object
store's packed_git pointer.
This can lead to problems in, for e.g., the pack-bitmap code, which does
something like the following (in pack-bitmap.c:open_pack_bitmap()):
struct bitmap_index *bitmap_git = ...;
for (p = get_all_packs(r); p; p = p->next) {
if (open_pack_bitmap_1(bitmap_git, p) == 0)
ret = 0;
}
which is a problem if two copies of the same pack exist in the
packed_git list because pack-bitmap.c:open_pack_bitmap_1() contains a
conditional like the following:
if (bitmap_git->pack || bitmap_git->midx) {
/* ignore extra bitmap file; we can only handle one */
warning("ignoring extra bitmap file: %s", packfile->pack_name);
close(fd);
return -1;
}
Avoid this scenario by not letting write_midx_internal() open a MIDX
that isn't also pointed at by the object store. So long as this is the
case, other routines should prefer to open MIDXs with
get_multi_pack_index() or reprepare_packed_git() instead of creating
instances on their own. Because get_multi_pack_index() returns
`r->object_store->multi_pack_index` if it is non-NULL, we'll only have
one instance of a MIDX open at one time, avoiding these problems.
To encourage this, drop the `struct multi_pack_index *` parameter from
`write_midx_internal()`, and rely instead on the `object_dir` to find
(or initialize) the correct MIDX instance.
Likewise, replace the call to `close_midx()` with
`close_object_store()`, since we're about to replace the MIDX with a new
one and should invalidate the object store's memory of any MIDX that
might have existed beforehand.
Note that this now forbids passing object directories that don't belong
to alternate repositories over `--object-dir`, since before we would
have happily opened a MIDX in any directory, but now restrict ourselves
to only those reachable by `r->objects->multi_pack_index` (and alternate
MIDXs that we can see by walking the `next` pointer).
As far as I can tell, supporting arbitrary directories with
`--object-dir` was a historical accident, since even the documentation
says `<alt>` when referring to the value passed to this option.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-02 04:34:01 +08:00
|
|
|
close_object_store(the_repository->objects);
|
2018-07-13 03:39:36 +08:00
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
if (ctx.nr - dropped_packs == 0) {
|
2020-03-29 06:18:22 +08:00
|
|
|
error(_("no pack files to index."));
|
|
|
|
result = 1;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:33 +08:00
|
|
|
cf = init_chunkfile(f);
|
2018-07-13 03:39:27 +08:00
|
|
|
|
2021-02-18 22:07:33 +08:00
|
|
|
add_chunk(cf, MIDX_CHUNKID_PACKNAMES, pack_name_concat_len,
|
|
|
|
write_midx_pack_names);
|
|
|
|
add_chunk(cf, MIDX_CHUNKID_OIDFANOUT, MIDX_CHUNK_FANOUT_SIZE,
|
|
|
|
write_midx_oid_fanout);
|
|
|
|
add_chunk(cf, MIDX_CHUNKID_OIDLOOKUP,
|
2021-02-18 22:07:37 +08:00
|
|
|
(size_t)ctx.entries_nr * the_hash_algo->rawsz,
|
2021-02-18 22:07:33 +08:00
|
|
|
write_midx_oid_lookup);
|
|
|
|
add_chunk(cf, MIDX_CHUNKID_OBJECTOFFSETS,
|
2021-02-18 22:07:37 +08:00
|
|
|
(size_t)ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH,
|
2021-02-18 22:07:33 +08:00
|
|
|
write_midx_object_offsets);
|
2018-07-13 03:39:27 +08:00
|
|
|
|
2021-02-18 22:07:33 +08:00
|
|
|
if (ctx.large_offsets_needed)
|
|
|
|
add_chunk(cf, MIDX_CHUNKID_LARGEOFFSETS,
|
2021-02-18 22:07:37 +08:00
|
|
|
(size_t)ctx.num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH,
|
2021-02-18 22:07:33 +08:00
|
|
|
write_midx_large_offsets);
|
2018-07-13 03:39:27 +08:00
|
|
|
|
2021-02-18 22:07:33 +08:00
|
|
|
write_midx_header(f, get_num_chunks(cf), ctx.nr - dropped_packs);
|
|
|
|
write_chunkfile(cf, &ctx);
|
2018-07-13 03:39:22 +08:00
|
|
|
|
2021-03-30 23:04:17 +08:00
|
|
|
finalize_hashfile(f, midx_hash, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
|
2021-02-18 22:07:33 +08:00
|
|
|
free_chunkfile(cf);
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
|
|
|
|
if (flags & MIDX_WRITE_REV_INDEX)
|
|
|
|
ctx.pack_order = midx_pack_order(&ctx);
|
|
|
|
|
|
|
|
if (flags & MIDX_WRITE_REV_INDEX)
|
|
|
|
write_midx_reverse_index(midx_name, midx_hash, &ctx);
|
|
|
|
|
2018-07-13 03:39:22 +08:00
|
|
|
commit_lock_file(&lk);
|
|
|
|
|
2021-09-01 04:51:59 +08:00
|
|
|
clear_midx_files_ext(object_dir, ".rev", midx_hash);
|
|
|
|
|
2018-07-13 03:39:36 +08:00
|
|
|
cleanup:
|
2021-02-18 22:07:26 +08:00
|
|
|
for (i = 0; i < ctx.nr; i++) {
|
|
|
|
if (ctx.info[i].p) {
|
|
|
|
close_pack(ctx.info[i].p);
|
|
|
|
free(ctx.info[i].p);
|
2018-07-13 03:39:26 +08:00
|
|
|
}
|
2021-02-18 22:07:26 +08:00
|
|
|
free(ctx.info[i].pack_name);
|
2018-07-13 03:39:26 +08:00
|
|
|
}
|
|
|
|
|
2021-02-18 22:07:26 +08:00
|
|
|
free(ctx.info);
|
2021-02-18 22:07:28 +08:00
|
|
|
free(ctx.entries);
|
2021-02-18 22:07:29 +08:00
|
|
|
free(ctx.pack_perm);
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
free(ctx.pack_order);
|
2018-07-13 03:39:36 +08:00
|
|
|
free(midx_name);
|
2019-06-11 07:35:25 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2021-03-30 23:04:11 +08:00
|
|
|
int write_midx_file(const char *object_dir,
|
|
|
|
const char *preferred_pack_name,
|
|
|
|
unsigned flags)
|
2019-06-11 07:35:25 +08:00
|
|
|
{
|
midx: avoid opening multiple MIDXs when writing
Opening multiple instance of the same MIDX can lead to problems like two
separate packed_git structures which represent the same pack being added
to the repository's object store.
The above scenario can happen because prepare_midx_pack() checks if
`m->packs[pack_int_id]` is NULL in order to determine if a pack has been
opened and installed in the repository before. But a caller can
construct two copies of the same MIDX by calling get_multi_pack_index()
and load_multi_pack_index() since the former manipulates the
object store directly but the latter is a lower-level routine which
allocates a new MIDX for each call.
So if prepare_midx_pack() is called on multiple MIDXs with the same
pack_int_id, then that pack will be installed twice in the object
store's packed_git pointer.
This can lead to problems in, for e.g., the pack-bitmap code, which does
something like the following (in pack-bitmap.c:open_pack_bitmap()):
struct bitmap_index *bitmap_git = ...;
for (p = get_all_packs(r); p; p = p->next) {
if (open_pack_bitmap_1(bitmap_git, p) == 0)
ret = 0;
}
which is a problem if two copies of the same pack exist in the
packed_git list because pack-bitmap.c:open_pack_bitmap_1() contains a
conditional like the following:
if (bitmap_git->pack || bitmap_git->midx) {
/* ignore extra bitmap file; we can only handle one */
warning("ignoring extra bitmap file: %s", packfile->pack_name);
close(fd);
return -1;
}
Avoid this scenario by not letting write_midx_internal() open a MIDX
that isn't also pointed at by the object store. So long as this is the
case, other routines should prefer to open MIDXs with
get_multi_pack_index() or reprepare_packed_git() instead of creating
instances on their own. Because get_multi_pack_index() returns
`r->object_store->multi_pack_index` if it is non-NULL, we'll only have
one instance of a MIDX open at one time, avoiding these problems.
To encourage this, drop the `struct multi_pack_index *` parameter from
`write_midx_internal()`, and rely instead on the `object_dir` to find
(or initialize) the correct MIDX instance.
Likewise, replace the call to `close_midx()` with
`close_object_store()`, since we're about to replace the MIDX with a new
one and should invalidate the object store's memory of any MIDX that
might have existed beforehand.
Note that this now forbids passing object directories that don't belong
to alternate repositories over `--object-dir`, since before we would
have happily opened a MIDX in any directory, but now restrict ourselves
to only those reachable by `r->objects->multi_pack_index` (and alternate
MIDXs that we can see by walking the `next` pointer).
As far as I can tell, supporting arbitrary directories with
`--object-dir` was a historical accident, since even the documentation
says `<alt>` when referring to the value passed to this option.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-02 04:34:01 +08:00
|
|
|
return write_midx_internal(object_dir, NULL, preferred_pack_name, flags);
|
2018-07-13 03:39:21 +08:00
|
|
|
}
|
2018-07-13 03:39:40 +08:00
|
|
|
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
/*
 * Payload handed to clear_midx_file_ext() through the directory walker.
 */
struct clear_midx_data {
	char *keep;      /* file name to leave in place, or NULL to keep none */
	const char *ext; /* suffix a file must end with to be a candidate */
};

/*
 * Per-file callback: unlink full_path when file_name begins with
 * "multi-pack-index-", ends with data->ext, and is not the file named by
 * data->keep. Dies if the unlink fails. full_path_len is not used.
 */
static void clear_midx_file_ext(const char *full_path, size_t full_path_len,
				const char *file_name, void *_data)
{
	struct clear_midx_data *opts = _data;

	/* Not one of our auxiliary MIDX files: leave it alone. */
	if (!starts_with(file_name, "multi-pack-index-"))
		return;
	if (!ends_with(file_name, opts->ext))
		return;

	/* The caller asked for this particular file to survive. */
	if (opts->keep && strcmp(opts->keep, file_name) == 0)
		return;

	if (unlink(full_path) != 0)
		die_errno(_("failed to remove %s"), full_path);
}
|
|
|
|
|
2021-09-01 04:51:55 +08:00
|
|
|
static void clear_midx_files_ext(const char *object_dir, const char *ext,
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
unsigned char *keep_hash)
|
|
|
|
{
|
|
|
|
struct clear_midx_data data;
|
|
|
|
memset(&data, 0, sizeof(struct clear_midx_data));
|
|
|
|
|
|
|
|
if (keep_hash)
|
|
|
|
data.keep = xstrfmt("multi-pack-index-%s%s",
|
|
|
|
hash_to_hex(keep_hash), ext);
|
|
|
|
data.ext = ext;
|
|
|
|
|
2021-09-01 04:51:55 +08:00
|
|
|
for_each_file_in_pack_dir(object_dir,
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
clear_midx_file_ext,
|
|
|
|
&data);
|
|
|
|
|
|
|
|
free(data.keep);
|
2018-07-13 03:39:21 +08:00
|
|
|
}
|
2018-07-13 03:39:40 +08:00
|
|
|
|
2018-10-13 01:34:19 +08:00
|
|
|
void clear_midx_file(struct repository *r)
|
2018-07-13 03:39:40 +08:00
|
|
|
{
|
2019-01-05 05:33:32 +08:00
|
|
|
char *midx = get_midx_filename(r->objects->odb->path);
|
2018-10-13 01:34:19 +08:00
|
|
|
|
|
|
|
if (r->objects && r->objects->multi_pack_index) {
|
|
|
|
close_midx(r->objects->multi_pack_index);
|
|
|
|
r->objects->multi_pack_index = NULL;
|
|
|
|
}
|
2018-07-13 03:39:40 +08:00
|
|
|
|
2020-08-13 23:55:00 +08:00
|
|
|
if (remove_path(midx))
|
2018-07-13 03:39:40 +08:00
|
|
|
die(_("failed to clear multi-pack-index at %s"), midx);
|
|
|
|
|
2021-09-01 04:51:55 +08:00
|
|
|
clear_midx_files_ext(r->objects->odb->path, ".rev", NULL);
|
pack-revindex: write multi-pack reverse indexes
Implement the writing half of multi-pack reverse indexes. This is
nothing more than the format describe a few patches ago, with a new set
of helper functions that will be used to clear out stale .rev files
corresponding to old MIDXs.
Unfortunately, a very similar comparison function as the one implemented
recently in pack-revindex.c is reimplemented here, this time accepting a
MIDX-internal type. An effort to DRY these up would create more
indirection and overhead than is necessary, so it isn't pursued here.
Currently, there are no callers which pass the MIDX_WRITE_REV_INDEX
flag, meaning that this is all dead code. But, that won't be the case
for long, since subsequent patches will introduce the multi-pack bitmap,
which will begin passing this field.
(In midx.c:write_midx_internal(), the two adjacent if statements share a
conditional, but are written separately since the first one will
eventually also handle the MIDX_WRITE_BITMAP flag, which does not yet
exist.)
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-03-30 23:04:32 +08:00
|
|
|
|
2018-07-13 03:39:40 +08:00
|
|
|
free(midx);
|
|
|
|
}
|
2018-09-14 02:02:13 +08:00
|
|
|
|
|
|
|
/* Set to 1 by midx_report(); callers reset it before a verification run. */
static int verify_midx_error;

/*
 * Print a printf-style diagnostic, followed by a newline, on stderr and
 * remember that at least one problem has been reported.
 */
__attribute__((format (printf, 1, 2)))
static void midx_report(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
	fputc('\n', stderr);

	verify_midx_error = 1;
}
|
|
|
|
|
2019-03-22 03:36:15 +08:00
|
|
|
/*
 * Associates an object's position in the multi-pack-index (`pos`) with
 * the id of the pack that the MIDX records for it (`pack_int_id`).
 */
struct pair_pos_vs_id
{
	uint32_t pos;
	uint32_t pack_int_id;
};

/*
 * qsort-style comparator ordering pairs by *descending* pack_int_id.
 *
 * Returns a positive value when `_a` has the smaller id (and so sorts
 * after `_b`), a negative value when it has the larger id, and 0 when the
 * ids are equal.
 *
 * The ids are compared rather than subtracted: the difference of two
 * uint32_t values converted to int has the wrong sign as soon as the ids
 * are 2^31 or more apart, which would make the ordering inconsistent.
 */
static int compare_pair_pos_vs_id(const void *_a, const void *_b)
{
	const struct pair_pos_vs_id *a = _a;
	const struct pair_pos_vs_id *b = _b;

	return (a->pack_int_id < b->pack_int_id) -
	       (a->pack_int_id > b->pack_int_id);
}
|
|
|
|
|
2019-03-22 03:36:14 +08:00
|
|
|
/*
 * Updating the progress meter for every single item is needlessly
 * expensive, so only every SPARSE_PROGRESS_INTERVAL-th count (and a count
 * of zero) is forwarded to display_progress().  The interval itself was
 * picked arbitrarily.
 */
#define SPARSE_PROGRESS_INTERVAL (1 << 12)
#define midx_display_sparse_progress(progress, n) \
	do { \
		uint64_t sparse_count_ = (n); \
		if (!(sparse_count_ % SPARSE_PROGRESS_INTERVAL)) \
			display_progress(progress, sparse_count_); \
	} while (0)
|
|
|
|
|
2019-10-22 02:39:58 +08:00
|
|
|
int verify_midx_file(struct repository *r, const char *object_dir, unsigned flags)
|
2018-09-14 02:02:13 +08:00
|
|
|
{
|
2019-03-22 03:36:15 +08:00
|
|
|
struct pair_pos_vs_id *pairs = NULL;
|
2018-09-14 02:02:19 +08:00
|
|
|
uint32_t i;
|
2019-10-22 02:40:01 +08:00
|
|
|
struct progress *progress = NULL;
|
2018-09-14 02:02:13 +08:00
|
|
|
struct multi_pack_index *m = load_multi_pack_index(object_dir, 1);
|
|
|
|
verify_midx_error = 0;
|
|
|
|
|
2020-08-17 22:04:48 +08:00
|
|
|
if (!m) {
|
|
|
|
int result = 0;
|
|
|
|
struct stat sb;
|
|
|
|
char *filename = get_midx_filename(object_dir);
|
|
|
|
if (!stat(filename, &sb)) {
|
|
|
|
error(_("multi-pack-index file exists, but failed to parse"));
|
|
|
|
result = 1;
|
|
|
|
}
|
|
|
|
free(filename);
|
|
|
|
return result;
|
|
|
|
}
|
2018-09-14 02:02:13 +08:00
|
|
|
|
2021-06-24 02:39:15 +08:00
|
|
|
if (!midx_checksum_valid(m))
|
|
|
|
midx_report(_("incorrect checksum"));
|
|
|
|
|
2019-10-22 02:40:01 +08:00
|
|
|
if (flags & MIDX_PROGRESS)
|
2020-09-25 20:33:35 +08:00
|
|
|
progress = start_delayed_progress(_("Looking for referenced packfiles"),
|
2019-10-22 02:40:01 +08:00
|
|
|
m->num_packs);
|
2018-09-14 02:02:19 +08:00
|
|
|
for (i = 0; i < m->num_packs; i++) {
|
2019-04-30 00:18:55 +08:00
|
|
|
if (prepare_midx_pack(r, m, i))
|
2018-09-14 02:02:19 +08:00
|
|
|
midx_report("failed to load pack in position %d", i);
|
2019-03-22 03:36:14 +08:00
|
|
|
|
|
|
|
display_progress(progress, i + 1);
|
2018-09-14 02:02:19 +08:00
|
|
|
}
|
2019-03-22 03:36:14 +08:00
|
|
|
stop_progress(&progress);
|
2018-09-14 02:02:19 +08:00
|
|
|
|
2018-09-14 02:02:20 +08:00
|
|
|
for (i = 0; i < 255; i++) {
|
|
|
|
uint32_t oid_fanout1 = ntohl(m->chunk_oid_fanout[i]);
|
|
|
|
uint32_t oid_fanout2 = ntohl(m->chunk_oid_fanout[i + 1]);
|
|
|
|
|
|
|
|
if (oid_fanout1 > oid_fanout2)
|
|
|
|
midx_report(_("oid fanout out of order: fanout[%d] = %"PRIx32" > %"PRIx32" = fanout[%d]"),
|
|
|
|
i, oid_fanout1, oid_fanout2, i + 1);
|
|
|
|
}
|
|
|
|
|
2020-03-29 06:18:22 +08:00
|
|
|
if (m->num_objects == 0) {
|
|
|
|
midx_report(_("the midx contains no oid"));
|
|
|
|
/*
|
|
|
|
* Remaining tests assume that we have objects, so we can
|
|
|
|
* return here.
|
|
|
|
*/
|
|
|
|
return verify_midx_error;
|
|
|
|
}
|
|
|
|
|
2019-10-22 02:40:01 +08:00
|
|
|
if (flags & MIDX_PROGRESS)
|
|
|
|
progress = start_sparse_progress(_("Verifying OID order in multi-pack-index"),
|
|
|
|
m->num_objects - 1);
|
2018-09-14 02:02:22 +08:00
|
|
|
for (i = 0; i < m->num_objects - 1; i++) {
|
|
|
|
struct object_id oid1, oid2;
|
|
|
|
|
|
|
|
nth_midxed_object_oid(&oid1, m, i);
|
|
|
|
nth_midxed_object_oid(&oid2, m, i + 1);
|
|
|
|
|
|
|
|
if (oidcmp(&oid1, &oid2) >= 0)
|
|
|
|
midx_report(_("oid lookup out of order: oid[%d] = %s >= %s = oid[%d]"),
|
|
|
|
i, oid_to_hex(&oid1), oid_to_hex(&oid2), i + 1);
|
2019-03-22 03:36:14 +08:00
|
|
|
|
|
|
|
midx_display_sparse_progress(progress, i + 1);
|
2018-09-14 02:02:22 +08:00
|
|
|
}
|
2019-03-22 03:36:14 +08:00
|
|
|
stop_progress(&progress);
|
2018-09-14 02:02:22 +08:00
|
|
|
|
2019-03-22 03:36:15 +08:00
|
|
|
/*
|
|
|
|
* Create an array mapping each object to its packfile id. Sort it
|
|
|
|
* to group the objects by packfile. Use this permutation to visit
|
|
|
|
* each of the objects and only require 1 packfile to be open at a
|
|
|
|
* time.
|
|
|
|
*/
|
|
|
|
ALLOC_ARRAY(pairs, m->num_objects);
|
|
|
|
for (i = 0; i < m->num_objects; i++) {
|
|
|
|
pairs[i].pos = i;
|
|
|
|
pairs[i].pack_int_id = nth_midxed_pack_int_id(m, i);
|
|
|
|
}
|
|
|
|
|
2019-10-22 02:40:01 +08:00
|
|
|
if (flags & MIDX_PROGRESS)
|
|
|
|
progress = start_sparse_progress(_("Sorting objects by packfile"),
|
|
|
|
m->num_objects);
|
2019-03-22 03:36:15 +08:00
|
|
|
display_progress(progress, 0); /* TODO: Measure QSORT() progress */
|
|
|
|
QSORT(pairs, m->num_objects, compare_pair_pos_vs_id);
|
|
|
|
stop_progress(&progress);
|
|
|
|
|
2019-10-22 02:40:01 +08:00
|
|
|
if (flags & MIDX_PROGRESS)
|
|
|
|
progress = start_sparse_progress(_("Verifying object offsets"), m->num_objects);
|
2018-09-14 02:02:25 +08:00
|
|
|
for (i = 0; i < m->num_objects; i++) {
|
|
|
|
struct object_id oid;
|
|
|
|
struct pack_entry e;
|
|
|
|
off_t m_offset, p_offset;
|
|
|
|
|
2019-03-22 03:36:15 +08:00
|
|
|
if (i > 0 && pairs[i-1].pack_int_id != pairs[i].pack_int_id &&
|
|
|
|
m->packs[pairs[i-1].pack_int_id])
|
|
|
|
{
|
|
|
|
close_pack_fd(m->packs[pairs[i-1].pack_int_id]);
|
|
|
|
close_pack_index(m->packs[pairs[i-1].pack_int_id]);
|
|
|
|
}
|
|
|
|
|
|
|
|
nth_midxed_object_oid(&oid, m, pairs[i].pos);
|
|
|
|
|
2019-04-30 00:18:55 +08:00
|
|
|
if (!fill_midx_entry(r, &oid, &e, m)) {
|
2018-09-14 02:02:25 +08:00
|
|
|
midx_report(_("failed to load pack entry for oid[%d] = %s"),
|
2019-03-22 03:36:15 +08:00
|
|
|
pairs[i].pos, oid_to_hex(&oid));
|
2018-09-14 02:02:25 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (open_pack_index(e.p)) {
|
|
|
|
midx_report(_("failed to load pack-index for packfile %s"),
|
|
|
|
e.p->pack_name);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
m_offset = e.offset;
|
|
|
|
p_offset = find_pack_entry_one(oid.hash, e.p);
|
|
|
|
|
|
|
|
if (m_offset != p_offset)
|
|
|
|
midx_report(_("incorrect object offset for oid[%d] = %s: %"PRIx64" != %"PRIx64),
|
2019-03-22 03:36:15 +08:00
|
|
|
pairs[i].pos, oid_to_hex(&oid), m_offset, p_offset);
|
2018-09-14 02:02:26 +08:00
|
|
|
|
2019-03-22 03:36:14 +08:00
|
|
|
midx_display_sparse_progress(progress, i + 1);
|
2018-09-14 02:02:25 +08:00
|
|
|
}
|
2018-09-14 02:02:26 +08:00
|
|
|
stop_progress(&progress);
|
2018-09-14 02:02:25 +08:00
|
|
|
|
2019-03-22 03:36:15 +08:00
|
|
|
free(pairs);
|
|
|
|
|
2018-09-14 02:02:13 +08:00
|
|
|
return verify_midx_error;
|
|
|
|
}
|
2019-06-11 07:35:23 +08:00
|
|
|
|
2019-10-22 02:39:58 +08:00
|
|
|
int expire_midx_packs(struct repository *r, const char *object_dir, unsigned flags)
|
2019-06-11 07:35:23 +08:00
|
|
|
{
|
2019-06-11 07:35:25 +08:00
|
|
|
uint32_t i, *count, result = 0;
|
|
|
|
struct string_list packs_to_drop = STRING_LIST_INIT_DUP;
|
|
|
|
struct multi_pack_index *m = load_multi_pack_index(object_dir, 1);
|
2019-10-22 02:40:00 +08:00
|
|
|
struct progress *progress = NULL;
|
2019-06-11 07:35:25 +08:00
|
|
|
|
|
|
|
if (!m)
|
|
|
|
return 0;
|
|
|
|
|
2021-03-14 00:17:22 +08:00
|
|
|
CALLOC_ARRAY(count, m->num_packs);
|
2019-10-22 02:40:00 +08:00
|
|
|
|
|
|
|
if (flags & MIDX_PROGRESS)
|
2020-09-25 20:33:35 +08:00
|
|
|
progress = start_delayed_progress(_("Counting referenced objects"),
|
2019-10-22 02:40:00 +08:00
|
|
|
m->num_objects);
|
2019-06-11 07:35:25 +08:00
|
|
|
for (i = 0; i < m->num_objects; i++) {
|
|
|
|
int pack_int_id = nth_midxed_pack_int_id(m, i);
|
|
|
|
count[pack_int_id]++;
|
2019-10-22 02:40:00 +08:00
|
|
|
display_progress(progress, i + 1);
|
2019-06-11 07:35:25 +08:00
|
|
|
}
|
2019-10-22 02:40:00 +08:00
|
|
|
stop_progress(&progress);
|
2019-06-11 07:35:25 +08:00
|
|
|
|
2019-10-22 02:40:00 +08:00
|
|
|
if (flags & MIDX_PROGRESS)
|
2020-09-25 20:33:35 +08:00
|
|
|
progress = start_delayed_progress(_("Finding and deleting unreferenced packfiles"),
|
2019-10-22 02:40:00 +08:00
|
|
|
m->num_packs);
|
2019-06-11 07:35:25 +08:00
|
|
|
for (i = 0; i < m->num_packs; i++) {
|
|
|
|
char *pack_name;
|
2019-10-22 02:40:00 +08:00
|
|
|
display_progress(progress, i + 1);
|
2019-06-11 07:35:25 +08:00
|
|
|
|
|
|
|
if (count[i])
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (prepare_midx_pack(r, m, i))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (m->packs[i]->pack_keep)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
pack_name = xstrdup(m->packs[i]->pack_name);
|
|
|
|
close_pack(m->packs[i]);
|
|
|
|
|
|
|
|
string_list_insert(&packs_to_drop, m->pack_names[i]);
|
|
|
|
unlink_pack_path(pack_name, 0);
|
|
|
|
free(pack_name);
|
|
|
|
}
|
2019-10-22 02:40:00 +08:00
|
|
|
stop_progress(&progress);
|
2019-06-11 07:35:25 +08:00
|
|
|
|
|
|
|
free(count);
|
|
|
|
|
midx: avoid opening multiple MIDXs when writing
Opening multiple instance of the same MIDX can lead to problems like two
separate packed_git structures which represent the same pack being added
to the repository's object store.
The above scenario can happen because prepare_midx_pack() checks if
`m->packs[pack_int_id]` is NULL in order to determine if a pack has been
opened and installed in the repository before. But a caller can
construct two copies of the same MIDX by calling get_multi_pack_index()
and load_multi_pack_index() since the former manipulates the
object store directly but the latter is a lower-level routine which
allocates a new MIDX for each call.
So if prepare_midx_pack() is called on multiple MIDXs with the same
pack_int_id, then that pack will be installed twice in the object
store's packed_git pointer.
This can lead to problems in, for e.g., the pack-bitmap code, which does
something like the following (in pack-bitmap.c:open_pack_bitmap()):
struct bitmap_index *bitmap_git = ...;
for (p = get_all_packs(r); p; p = p->next) {
if (open_pack_bitmap_1(bitmap_git, p) == 0)
ret = 0;
}
which is a problem if two copies of the same pack exist in the
packed_git list because pack-bitmap.c:open_pack_bitmap_1() contains a
conditional like the following:
if (bitmap_git->pack || bitmap_git->midx) {
/* ignore extra bitmap file; we can only handle one */
warning("ignoring extra bitmap file: %s", packfile->pack_name);
close(fd);
return -1;
}
Avoid this scenario by not letting write_midx_internal() open a MIDX
that isn't also pointed at by the object store. So long as this is the
case, other routines should prefer to open MIDXs with
get_multi_pack_index() or reprepare_packed_git() instead of creating
instances on their own. Because get_multi_pack_index() returns
`r->object_store->multi_pack_index` if it is non-NULL, we'll only have
one instance of a MIDX open at one time, avoiding these problems.
To encourage this, drop the `struct multi_pack_index *` parameter from
`write_midx_internal()`, and rely instead on the `object_dir` to find
(or initialize) the correct MIDX instance.
Likewise, replace the call to `close_midx()` with
`close_object_store()`, since we're about to replace the MIDX with a new
one and should invalidate the object store's memory of any MIDX that
might have existed beforehand.
Note that this now forbids passing object directories that don't belong
to alternate repositories over `--object-dir`, since before we would
have happily opened a MIDX in any directory, but now restrict ourselves
to only those reachable by `r->objects->multi_pack_index` (and alternate
MIDXs that we can see by walking the `next` pointer).
As far as I can tell, supporting arbitrary directories with
`--object-dir` was a historical accident, since even the documentation
says `<alt>` when referring to the value passed to this option.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-02 04:34:01 +08:00
|
|
|
if (packs_to_drop.nr) {
|
|
|
|
result = write_midx_internal(object_dir, &packs_to_drop, NULL, flags);
|
|
|
|
m = NULL;
|
|
|
|
}
|
2019-06-11 07:35:25 +08:00
|
|
|
|
|
|
|
string_list_clear(&packs_to_drop, 0);
|
|
|
|
return result;
|
2019-06-11 07:35:23 +08:00
|
|
|
}
|
2019-06-11 07:35:26 +08:00
|
|
|
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referenced objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
struct repack_info {
|
|
|
|
timestamp_t mtime;
|
|
|
|
uint32_t referenced_objects;
|
|
|
|
uint32_t pack_int_id;
|
|
|
|
};
|
|
|
|
|
|
|
|
static int compare_by_mtime(const void *a_, const void *b_)
|
2019-06-11 07:35:26 +08:00
|
|
|
{
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
const struct repack_info *a, *b;
|
|
|
|
|
|
|
|
a = (const struct repack_info *)a_;
|
|
|
|
b = (const struct repack_info *)b_;
|
|
|
|
|
|
|
|
if (a->mtime < b->mtime)
|
|
|
|
return -1;
|
|
|
|
if (a->mtime > b->mtime)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-05-11 00:07:34 +08:00
|
|
|
static int fill_included_packs_all(struct repository *r,
|
|
|
|
struct multi_pack_index *m,
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
unsigned char *include_pack)
|
|
|
|
{
|
2020-05-11 00:07:34 +08:00
|
|
|
uint32_t i, count = 0;
|
|
|
|
int pack_kept_objects = 0;
|
|
|
|
|
|
|
|
repo_config_get_bool(r, "repack.packkeptobjects", &pack_kept_objects);
|
|
|
|
|
|
|
|
for (i = 0; i < m->num_packs; i++) {
|
|
|
|
if (prepare_midx_pack(r, m, i))
|
|
|
|
continue;
|
|
|
|
if (!pack_kept_objects && m->packs[i]->pack_keep)
|
|
|
|
continue;
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
|
|
|
|
include_pack[i] = 1;
|
2020-05-11 00:07:34 +08:00
|
|
|
count++;
|
|
|
|
}
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
|
2020-05-11 00:07:34 +08:00
|
|
|
return count < 2;
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int fill_included_packs_batch(struct repository *r,
|
|
|
|
struct multi_pack_index *m,
|
|
|
|
unsigned char *include_pack,
|
|
|
|
size_t batch_size)
|
|
|
|
{
|
|
|
|
uint32_t i, packs_to_repack;
|
|
|
|
size_t total_size;
|
|
|
|
struct repack_info *pack_info = xcalloc(m->num_packs, sizeof(struct repack_info));
|
2020-05-11 00:07:34 +08:00
|
|
|
int pack_kept_objects = 0;
|
|
|
|
|
|
|
|
repo_config_get_bool(r, "repack.packkeptobjects", &pack_kept_objects);
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
|
|
|
|
for (i = 0; i < m->num_packs; i++) {
|
|
|
|
pack_info[i].pack_int_id = i;
|
|
|
|
|
|
|
|
if (prepare_midx_pack(r, m, i))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
pack_info[i].mtime = m->packs[i]->mtime;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; batch_size && i < m->num_objects; i++) {
|
|
|
|
uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
|
|
|
|
pack_info[pack_int_id].referenced_objects++;
|
|
|
|
}
|
|
|
|
|
|
|
|
QSORT(pack_info, m->num_packs, compare_by_mtime);
|
|
|
|
|
|
|
|
total_size = 0;
|
|
|
|
packs_to_repack = 0;
|
|
|
|
for (i = 0; total_size < batch_size && i < m->num_packs; i++) {
|
|
|
|
int pack_int_id = pack_info[i].pack_int_id;
|
|
|
|
struct packed_git *p = m->packs[pack_int_id];
|
|
|
|
size_t expected_size;
|
|
|
|
|
|
|
|
if (!p)
|
|
|
|
continue;
|
2020-05-11 00:07:34 +08:00
|
|
|
if (!pack_kept_objects && p->pack_keep)
|
|
|
|
continue;
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
if (open_pack_index(p) || !p->num_objects)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
expected_size = (size_t)(p->pack_size
|
|
|
|
* pack_info[i].referenced_objects);
|
|
|
|
expected_size /= p->num_objects;
|
|
|
|
|
|
|
|
if (expected_size >= batch_size)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
packs_to_repack++;
|
|
|
|
total_size += expected_size;
|
|
|
|
include_pack[pack_int_id] = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
free(pack_info);
|
|
|
|
|
2020-08-11 23:30:18 +08:00
|
|
|
if (packs_to_repack < 2)
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
return 1;
|
|
|
|
|
2019-06-11 07:35:26 +08:00
|
|
|
return 0;
|
|
|
|
}
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referenced objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
|
2019-10-22 02:39:58 +08:00
|
|
|
int midx_repack(struct repository *r, const char *object_dir, size_t batch_size, unsigned flags)
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
{
|
|
|
|
int result = 0;
|
|
|
|
uint32_t i;
|
|
|
|
unsigned char *include_pack;
|
|
|
|
struct child_process cmd = CHILD_PROCESS_INIT;
|
2020-08-13 00:52:54 +08:00
|
|
|
FILE *cmd_in;
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
struct strbuf base_name = STRBUF_INIT;
|
|
|
|
struct multi_pack_index *m = load_multi_pack_index(object_dir, 1);
|
|
|
|
|
2020-05-11 00:07:33 +08:00
|
|
|
/*
|
|
|
|
* When updating the default for these configuration
|
|
|
|
* variables in builtin/repack.c, these must be adjusted
|
|
|
|
* to match.
|
|
|
|
*/
|
|
|
|
int delta_base_offset = 1;
|
|
|
|
int use_delta_islands = 0;
|
|
|
|
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
if (!m)
|
|
|
|
return 0;
|
|
|
|
|
2021-03-14 00:17:22 +08:00
|
|
|
CALLOC_ARRAY(include_pack, m->num_packs);
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
|
|
|
|
if (batch_size) {
|
|
|
|
if (fill_included_packs_batch(r, m, include_pack, batch_size))
|
|
|
|
goto cleanup;
|
2020-05-11 00:07:34 +08:00
|
|
|
} else if (fill_included_packs_all(r, m, include_pack))
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
goto cleanup;
|
|
|
|
|
2020-05-11 00:07:33 +08:00
|
|
|
repo_config_get_bool(r, "repack.usedeltabaseoffset", &delta_base_offset);
|
|
|
|
repo_config_get_bool(r, "repack.usedeltaislands", &use_delta_islands);
|
|
|
|
|
2020-07-29 04:25:12 +08:00
|
|
|
strvec_push(&cmd.args, "pack-objects");
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
|
|
|
|
strbuf_addstr(&base_name, object_dir);
|
|
|
|
strbuf_addstr(&base_name, "/pack/pack");
|
2020-07-29 04:25:12 +08:00
|
|
|
strvec_push(&cmd.args, base_name.buf);
|
2019-10-22 02:40:02 +08:00
|
|
|
|
2020-05-11 00:07:33 +08:00
|
|
|
if (delta_base_offset)
|
2020-07-29 04:25:12 +08:00
|
|
|
strvec_push(&cmd.args, "--delta-base-offset");
|
2020-05-11 00:07:33 +08:00
|
|
|
if (use_delta_islands)
|
2020-07-29 04:25:12 +08:00
|
|
|
strvec_push(&cmd.args, "--delta-islands");
|
2020-05-11 00:07:33 +08:00
|
|
|
|
2019-10-22 02:40:02 +08:00
|
|
|
if (flags & MIDX_PROGRESS)
|
2020-07-29 04:25:12 +08:00
|
|
|
strvec_push(&cmd.args, "--progress");
|
2019-10-22 02:40:02 +08:00
|
|
|
else
|
2020-07-29 04:25:12 +08:00
|
|
|
strvec_push(&cmd.args, "-q");
|
2019-10-22 02:40:02 +08:00
|
|
|
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
strbuf_release(&base_name);
|
|
|
|
|
|
|
|
cmd.git_cmd = 1;
|
|
|
|
cmd.in = cmd.out = -1;
|
|
|
|
|
|
|
|
if (start_command(&cmd)) {
|
|
|
|
error(_("could not start pack-objects"));
|
|
|
|
result = 1;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2020-08-13 00:52:54 +08:00
|
|
|
cmd_in = xfdopen(cmd.in, "w");
|
|
|
|
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
for (i = 0; i < m->num_objects; i++) {
|
|
|
|
struct object_id oid;
|
|
|
|
uint32_t pack_int_id = nth_midxed_pack_int_id(m, i);
|
|
|
|
|
|
|
|
if (!include_pack[pack_int_id])
|
|
|
|
continue;
|
|
|
|
|
|
|
|
nth_midxed_object_oid(&oid, m, i);
|
2020-08-13 00:52:54 +08:00
|
|
|
fprintf(cmd_in, "%s\n", oid_to_hex(&oid));
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
}
|
2020-08-13 00:52:54 +08:00
|
|
|
fclose(cmd_in);
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
|
|
|
|
if (finish_command(&cmd)) {
|
|
|
|
error(_("could not finish pack-objects"));
|
|
|
|
result = 1;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
midx: avoid opening multiple MIDXs when writing
Opening multiple instance of the same MIDX can lead to problems like two
separate packed_git structures which represent the same pack being added
to the repository's object store.
The above scenario can happen because prepare_midx_pack() checks if
`m->packs[pack_int_id]` is NULL in order to determine if a pack has been
opened and installed in the repository before. But a caller can
construct two copies of the same MIDX by calling get_multi_pack_index()
and load_multi_pack_index() since the former manipulates the
object store directly but the latter is a lower-level routine which
allocates a new MIDX for each call.
So if prepare_midx_pack() is called on multiple MIDXs with the same
pack_int_id, then that pack will be installed twice in the object
store's packed_git pointer.
This can lead to problems in, for e.g., the pack-bitmap code, which does
something like the following (in pack-bitmap.c:open_pack_bitmap()):
struct bitmap_index *bitmap_git = ...;
for (p = get_all_packs(r); p; p = p->next) {
if (open_pack_bitmap_1(bitmap_git, p) == 0)
ret = 0;
}
which is a problem if two copies of the same pack exist in the
packed_git list because pack-bitmap.c:open_pack_bitmap_1() contains a
conditional like the following:
if (bitmap_git->pack || bitmap_git->midx) {
/* ignore extra bitmap file; we can only handle one */
warning("ignoring extra bitmap file: %s", packfile->pack_name);
close(fd);
return -1;
}
Avoid this scenario by not letting write_midx_internal() open a MIDX
that isn't also pointed at by the object store. So long as this is the
case, other routines should prefer to open MIDXs with
get_multi_pack_index() or reprepare_packed_git() instead of creating
instances on their own. Because get_multi_pack_index() returns
`r->object_store->multi_pack_index` if it is non-NULL, we'll only have
one instance of a MIDX open at one time, avoiding these problems.
To encourage this, drop the `struct multi_pack_index *` parameter from
`write_midx_internal()`, and rely instead on the `object_dir` to find
(or initialize) the correct MIDX instance.
Likewise, replace the call to `close_midx()` with
`close_object_store()`, since we're about to replace the MIDX with a new
one and should invalidate the object store's memory of any MIDX that
might have existed beforehand.
Note that this now forbids passing object directories that don't belong
to alternate repositories over `--object-dir`, since before we would
have happily opened a MIDX in any directory, but now restrict ourselves
to only those reachable by `r->objects->multi_pack_index` (and alternate
MIDXs that we can see by walking the `next` pointer).
As far as I can tell, supporting arbitrary directories with
`--object-dir` was a historical accident, since even the documentation
says `<alt>` when referring to the value passed to this option.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-09-02 04:34:01 +08:00
|
|
|
result = write_midx_internal(object_dir, NULL, NULL, flags);
|
midx: implement midx_repack()
To repack with a non-zero batch-size, first sort all pack-files by
their modified time. Second, walk those pack-files from oldest
to newest, compute their expected size, and add the packs to a list
if they are smaller than the given batch-size. Stop when the total
expected size is at least the batch size.
If the batch size is zero, select all packs in the multi-pack-index.
Finally, collect the objects from the multi-pack-index that are in
the selected packs and send them to 'git pack-objects'. Write a new
multi-pack-index that includes the new pack.
Using a batch size of zero is very similar to a standard 'git repack'
command, except that we do not delete the old packs and instead rely
on the new multi-pack-index to prevent new processes from reading the
old packs. This does not disrupt other Git processes that are currently
reading the old packs based on the old multi-pack-index.
While first designing a 'git multi-pack-index repack' operation, I
started by collecting the batches based on the actual size of the
objects instead of the size of the pack-files. This allows repacking
a large pack-file that has very few referencd objects. However, this
came at a significant cost of parsing pack-files instead of simply
reading the multi-pack-index and getting the file information for
the pack-files. The "expected size" version provides similar
behavior, but could skip a pack-file if the average object size is
much larger than the actual size of the referenced objects, or
can create a large pack if the actual size of the referenced objects
is larger than the expected size.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-11 07:35:27 +08:00
|
|
|
m = NULL;
|
|
|
|
|
|
|
|
cleanup:
|
|
|
|
if (m)
|
|
|
|
close_midx(m);
|
|
|
|
free(include_pack);
|
|
|
|
return result;
|
|
|
|
}
|