Merge branch 'tb/pack-revindex-api'

Abstract accesses to in-core revindex that allows enumerating
objects stored in a packfile in the order they appear in the pack,
in preparation for introducing an on-disk precomputed revindex.

* tb/pack-revindex-api: (21 commits)
  for_each_object_in_pack(): clarify pack vs index ordering
  pack-revindex.c: avoid direct revindex access in 'offset_to_pack_pos()'
  pack-revindex: hide the definition of 'revindex_entry'
  pack-revindex: remove unused 'find_revindex_position()'
  pack-revindex: remove unused 'find_pack_revindex()'
  builtin/gc.c: guess the size of the revindex
  for_each_object_in_pack(): convert to new revindex API
  unpack_entry(): convert to new revindex API
  packed_object_info(): convert to new revindex API
  retry_bad_packed_offset(): convert to new revindex API
  get_delta_base_oid(): convert to new revindex API
  rebuild_existing_bitmaps(): convert to new revindex API
  try_partial_reuse(): convert to new revindex API
  get_size_by_pos(): convert to new revindex API
  show_objects_for_type(): convert to new revindex API
  bitmap_position_packfile(): convert to new revindex API
  check_object(): convert to new revindex API
  write_reused_pack_verbatim(): convert to new revindex API
  write_reused_pack_one(): convert to new revindex API
  write_reuse_object(): convert to new revindex API
  ...
This commit is contained in:
Junio C Hamano 2021-01-25 14:19:19 -08:00
commit bcaaf972e6
6 changed files with 186 additions and 86 deletions

View File

@ -301,7 +301,7 @@ static uint64_t estimate_repack_memory(struct packed_git *pack)
/* and then obj_hash[], underestimated in fact */
heap += sizeof(struct object *) * nr_objects;
/* revindex is used also */
heap += sizeof(struct revindex_entry) * nr_objects;
heap += (sizeof(off_t) + sizeof(uint32_t)) * nr_objects;
/*
* read_sha1_file() (either at delta calculation phase, or
* writing phase) also fills up the delta base cache

View File

@ -419,7 +419,7 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
{
struct packed_git *p = IN_PACK(entry);
struct pack_window *w_curs = NULL;
struct revindex_entry *revidx;
uint32_t pos;
off_t offset;
enum object_type type = oe_type(entry);
off_t datalen;
@ -436,10 +436,15 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
type, entry_size);
offset = entry->in_pack_offset;
revidx = find_pack_revindex(p, offset);
datalen = revidx[1].offset - offset;
if (offset_to_pack_pos(p, offset, &pos) < 0)
die(_("write_reuse_object: could not locate %s, expected at "
"offset %"PRIuMAX" in pack %s"),
oid_to_hex(&entry->idx.oid), (uintmax_t)offset,
p->pack_name);
datalen = pack_pos_to_offset(p, pos + 1) - offset;
if (!pack_to_stdout && p->index_version > 1 &&
check_pack_crc(p, &w_curs, offset, datalen, revidx->nr)) {
check_pack_crc(p, &w_curs, offset, datalen,
pack_pos_to_index(p, pos))) {
error(_("bad packed object CRC for %s"),
oid_to_hex(&entry->idx.oid));
unuse_pack(&w_curs);
@ -863,8 +868,8 @@ static void write_reused_pack_one(size_t pos, struct hashfile *out,
enum object_type type;
unsigned long size;
offset = reuse_packfile->revindex[pos].offset;
next = reuse_packfile->revindex[pos + 1].offset;
offset = pack_pos_to_offset(reuse_packfile, pos);
next = pack_pos_to_offset(reuse_packfile, pos + 1);
record_reused_object(offset, offset - hashfile_total(out));
@ -884,11 +889,17 @@ static void write_reused_pack_one(size_t pos, struct hashfile *out,
/* Convert to REF_DELTA if we must... */
if (!allow_ofs_delta) {
int base_pos = find_revindex_position(reuse_packfile, base_offset);
uint32_t base_pos;
struct object_id base_oid;
if (offset_to_pack_pos(reuse_packfile, base_offset, &base_pos) < 0)
die(_("expected object at offset %"PRIuMAX" "
"in pack %s"),
(uintmax_t)base_offset,
reuse_packfile->pack_name);
nth_packed_object_id(&base_oid, reuse_packfile,
reuse_packfile->revindex[base_pos].nr);
pack_pos_to_index(reuse_packfile, base_pos));
len = encode_in_pack_object_header(header, sizeof(header),
OBJ_REF_DELTA, size);
@ -941,7 +952,7 @@ static size_t write_reused_pack_verbatim(struct hashfile *out,
off_t to_write;
written = (pos * BITS_IN_EWORD);
to_write = reuse_packfile->revindex[written].offset
to_write = pack_pos_to_offset(reuse_packfile, written)
- sizeof(struct pack_header);
/* We're recording one chunk, not one object. */
@ -1806,11 +1817,11 @@ static void check_object(struct object_entry *entry, uint32_t object_index)
goto give_up;
}
if (reuse_delta && !entry->preferred_base) {
struct revindex_entry *revidx;
revidx = find_pack_revindex(p, ofs);
if (!revidx)
uint32_t pos;
if (offset_to_pack_pos(p, ofs, &pos) < 0)
goto give_up;
if (!nth_packed_object_id(&base_ref, p, revidx->nr))
if (!nth_packed_object_id(&base_ref, p,
pack_pos_to_index(p, pos)))
have_base = 1;
}
entry->in_pack_header_size = used + used_0;

View File

@ -407,11 +407,14 @@ static inline int bitmap_position_extended(struct bitmap_index *bitmap_git,
static inline int bitmap_position_packfile(struct bitmap_index *bitmap_git,
const struct object_id *oid)
{
uint32_t pos;
off_t offset = find_pack_entry_one(oid->hash, bitmap_git->pack);
if (!offset)
return -1;
return find_revindex_position(bitmap_git->pack, offset);
if (offset_to_pack_pos(bitmap_git->pack, offset, &pos) < 0)
return -1;
return pos;
}
static int bitmap_position(struct bitmap_index *bitmap_git,
@ -708,21 +711,22 @@ static void show_objects_for_type(
for (offset = 0; offset < BITS_IN_EWORD; ++offset) {
struct object_id oid;
struct revindex_entry *entry;
uint32_t hash = 0;
uint32_t hash = 0, index_pos;
off_t ofs;
if ((word >> offset) == 0)
break;
offset += ewah_bit_ctz64(word >> offset);
entry = &bitmap_git->pack->revindex[pos + offset];
nth_packed_object_id(&oid, bitmap_git->pack, entry->nr);
index_pos = pack_pos_to_index(bitmap_git->pack, pos + offset);
ofs = pack_pos_to_offset(bitmap_git->pack, pos + offset);
nth_packed_object_id(&oid, bitmap_git->pack, index_pos);
if (bitmap_git->hashes)
hash = get_be32(bitmap_git->hashes + entry->nr);
hash = get_be32(bitmap_git->hashes + index_pos);
show_reach(&oid, object_type, 0, hash, bitmap_git->pack, entry->offset);
show_reach(&oid, object_type, 0, hash, bitmap_git->pack, ofs);
}
}
}
@ -831,11 +835,11 @@ static unsigned long get_size_by_pos(struct bitmap_index *bitmap_git,
oi.sizep = &size;
if (pos < pack->num_objects) {
struct revindex_entry *entry = &pack->revindex[pos];
if (packed_object_info(the_repository, pack,
entry->offset, &oi) < 0) {
off_t ofs = pack_pos_to_offset(pack, pos);
if (packed_object_info(the_repository, pack, ofs, &oi) < 0) {
struct object_id oid;
nth_packed_object_id(&oid, pack, entry->nr);
nth_packed_object_id(&oid, pack,
pack_pos_to_index(pack, pos));
die(_("unable to get size of %s"), oid_to_hex(&oid));
}
} else {
@ -1065,23 +1069,21 @@ static void try_partial_reuse(struct bitmap_index *bitmap_git,
struct bitmap *reuse,
struct pack_window **w_curs)
{
struct revindex_entry *revidx;
off_t offset;
off_t offset, header;
enum object_type type;
unsigned long size;
if (pos >= bitmap_git->pack->num_objects)
return; /* not actually in the pack */
revidx = &bitmap_git->pack->revindex[pos];
offset = revidx->offset;
offset = header = pack_pos_to_offset(bitmap_git->pack, pos);
type = unpack_object_header(bitmap_git->pack, w_curs, &offset, &size);
if (type < 0)
return; /* broken packfile, punt */
if (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA) {
off_t base_offset;
int base_pos;
uint32_t base_pos;
/*
* Find the position of the base object so we can look it up
@ -1092,11 +1094,10 @@ static void try_partial_reuse(struct bitmap_index *bitmap_git,
* more detail.
*/
base_offset = get_delta_base(bitmap_git->pack, w_curs,
&offset, type, revidx->offset);
&offset, type, header);
if (!base_offset)
return;
base_pos = find_revindex_position(bitmap_git->pack, base_offset);
if (base_pos < 0)
if (offset_to_pack_pos(bitmap_git->pack, base_offset, &base_pos) < 0)
return;
/*
@ -1391,11 +1392,10 @@ uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git,
for (i = 0; i < num_objects; ++i) {
struct object_id oid;
struct revindex_entry *entry;
struct object_entry *oe;
entry = &bitmap_git->pack->revindex[i];
nth_packed_object_id(&oid, bitmap_git->pack, entry->nr);
nth_packed_object_id(&oid, bitmap_git->pack,
pack_pos_to_index(bitmap_git->pack, i));
oe = packlist_find(mapping, &oid);
if (oe)

View File

@ -3,6 +3,11 @@
#include "object-store.h"
#include "packfile.h"
struct revindex_entry {
off_t offset;
unsigned int nr;
};
/*
* Pack index for existing packs gives us easy access to the offsets into
* corresponding pack file where each object's data starts, but the entries
@ -169,17 +174,24 @@ int load_pack_revindex(struct packed_git *p)
return 0;
}
int find_revindex_position(struct packed_git *p, off_t ofs)
int offset_to_pack_pos(struct packed_git *p, off_t ofs, uint32_t *pos)
{
int lo = 0;
int hi = p->num_objects + 1;
const struct revindex_entry *revindex = p->revindex;
unsigned lo, hi;
if (load_pack_revindex(p) < 0)
return -1;
lo = 0;
hi = p->num_objects + 1;
do {
const unsigned mi = lo + (hi - lo) / 2;
if (revindex[mi].offset == ofs) {
return mi;
} else if (ofs < revindex[mi].offset)
off_t got = pack_pos_to_offset(p, mi);
if (got == ofs) {
*pos = mi;
return 0;
} else if (ofs < got)
hi = mi;
else
lo = mi + 1;
@ -189,17 +201,20 @@ int find_revindex_position(struct packed_git *p, off_t ofs)
return -1;
}
struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs)
uint32_t pack_pos_to_index(struct packed_git *p, uint32_t pos)
{
int pos;
if (load_pack_revindex(p))
return NULL;
pos = find_revindex_position(p, ofs);
if (pos < 0)
return NULL;
return p->revindex + pos;
if (!p->revindex)
BUG("pack_pos_to_index: reverse index not yet loaded");
if (p->num_objects <= pos)
BUG("pack_pos_to_index: out-of-bounds object at %"PRIu32, pos);
return p->revindex[pos].nr;
}
/*
 * Return the byte offset stored in the in-core reverse index for the
 * entry at pack position 'pos'.
 *
 * The bounds check deliberately uses '<' rather than '<=', so position
 * 'p->num_objects' (one past the last object) is accepted; the
 * PACK_REVINDEX_H header documents that position as yielding the total
 * size of the pack.
 *
 * This function does not load the reverse index itself: it calls BUG()
 * if 'p->revindex' is unset, or if 'pos' is greater than
 * 'p->num_objects'.
 */
off_t pack_pos_to_offset(struct packed_git *p, uint32_t pos)
{
	if (!p->revindex)
		/* Name this function (not pack_pos_to_index) in the report. */
		BUG("pack_pos_to_offset: reverse index not yet loaded");
	if (p->num_objects < pos)
		BUG("pack_pos_to_offset: out-of-bounds object at %"PRIu32, pos);
	return p->revindex[pos].offset;
}

View File

@ -1,16 +1,62 @@
#ifndef PACK_REVINDEX_H
#define PACK_REVINDEX_H
/**
* A revindex allows converting efficiently between three properties
* of an object within a pack:
*
* - index position: the numeric position within the list of sorted object ids
* found in the .idx file
*
* - pack position: the numeric position within the list of objects in their
* order within the actual .pack file (i.e., 0 is the first object in the
* .pack, 1 is the second, and so on)
*
* - offset: the byte offset within the .pack file at which the object contents
* can be found
*/
struct packed_git;
struct revindex_entry {
off_t offset;
unsigned int nr;
};
/*
* load_pack_revindex populates the revindex's internal data-structures for the
* given pack, returning zero on success and a negative value otherwise.
*/
int load_pack_revindex(struct packed_git *p);
int find_revindex_position(struct packed_git *p, off_t ofs);
struct revindex_entry *find_pack_revindex(struct packed_git *p, off_t ofs);
/*
* offset_to_pack_pos converts an object offset to a pack position. This
* function returns zero on success, and a negative number otherwise. The
* parameter 'pos' is usable only on success.
*
* If the reverse index has not yet been loaded, this function loads it lazily,
* and returns a negative number if an error was encountered.
*
* This function runs in time O(log N) with the number of objects in the pack.
*/
int offset_to_pack_pos(struct packed_git *p, off_t ofs, uint32_t *pos);
/*
* pack_pos_to_index converts the given pack-relative position 'pos' into an
* index-relative position and returns it.
*
* If the reverse index has not yet been loaded, or the position is out of
* bounds, this function aborts.
*
* This function runs in constant time.
*/
uint32_t pack_pos_to_index(struct packed_git *p, uint32_t pos);
/*
* pack_pos_to_offset converts the given pack-relative position 'pos' into a
* pack offset. For a pack with 'N' objects, asking for position 'N' will return
* the total size (in bytes) of the pack.
*
* If the reverse index has not yet been loaded, or the position is out of
* bounds, this function aborts.
*
* This function runs in constant time.
*/
off_t pack_pos_to_offset(struct packed_git *p, uint32_t pos);
#endif

View File

@ -1235,18 +1235,18 @@ static int get_delta_base_oid(struct packed_git *p,
oidread(oid, base);
return 0;
} else if (type == OBJ_OFS_DELTA) {
struct revindex_entry *revidx;
uint32_t base_pos;
off_t base_offset = get_delta_base(p, w_curs, &curpos,
type, delta_obj_offset);
if (!base_offset)
return -1;
revidx = find_pack_revindex(p, base_offset);
if (!revidx)
if (offset_to_pack_pos(p, base_offset, &base_pos) < 0)
return -1;
return nth_packed_object_id(oid, p, revidx->nr);
return nth_packed_object_id(oid, p,
pack_pos_to_index(p, base_pos));
} else
return -1;
}
@ -1256,12 +1256,11 @@ static int retry_bad_packed_offset(struct repository *r,
off_t obj_offset)
{
int type;
struct revindex_entry *revidx;
uint32_t pos;
struct object_id oid;
revidx = find_pack_revindex(p, obj_offset);
if (!revidx)
if (offset_to_pack_pos(p, obj_offset, &pos) < 0)
return OBJ_BAD;
nth_packed_object_id(&oid, p, revidx->nr);
nth_packed_object_id(&oid, p, pack_pos_to_index(p, pos));
mark_bad_packed_object(p, oid.hash);
type = oid_object_info(r, &oid, NULL);
if (type <= OBJ_NONE)
@ -1538,8 +1537,15 @@ int packed_object_info(struct repository *r, struct packed_git *p,
}
if (oi->disk_sizep) {
struct revindex_entry *revidx = find_pack_revindex(p, obj_offset);
*oi->disk_sizep = revidx[1].offset - obj_offset;
uint32_t pos;
if (offset_to_pack_pos(p, obj_offset, &pos) < 0) {
error("could not find object at offset %"PRIuMAX" "
"in pack %s", (uintmax_t)obj_offset, p->pack_name);
type = OBJ_BAD;
goto out;
}
*oi->disk_sizep = pack_pos_to_offset(p, pos + 1) - obj_offset;
}
if (oi->typep || oi->type_name) {
@ -1688,11 +1694,21 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset,
}
if (do_check_packed_object_crc && p->index_version > 1) {
struct revindex_entry *revidx = find_pack_revindex(p, obj_offset);
off_t len = revidx[1].offset - obj_offset;
if (check_pack_crc(p, &w_curs, obj_offset, len, revidx->nr)) {
uint32_t pack_pos, index_pos;
off_t len;
if (offset_to_pack_pos(p, obj_offset, &pack_pos) < 0) {
error("could not find object at offset %"PRIuMAX" in pack %s",
(uintmax_t)obj_offset, p->pack_name);
data = NULL;
goto out;
}
len = pack_pos_to_offset(p, pack_pos + 1) - obj_offset;
index_pos = pack_pos_to_index(p, pack_pos);
if (check_pack_crc(p, &w_curs, obj_offset, len, index_pos)) {
struct object_id oid;
nth_packed_object_id(&oid, p, revidx->nr);
nth_packed_object_id(&oid, p, index_pos);
error("bad packed object CRC for %s",
oid_to_hex(&oid));
mark_bad_packed_object(p, oid.hash);
@ -1775,11 +1791,11 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset,
* This is costly but should happen only in the presence
* of a corrupted pack, and is better than failing outright.
*/
struct revindex_entry *revidx;
uint32_t pos;
struct object_id base_oid;
revidx = find_pack_revindex(p, obj_offset);
if (revidx) {
nth_packed_object_id(&base_oid, p, revidx->nr);
if (!(offset_to_pack_pos(p, obj_offset, &pos))) {
nth_packed_object_id(&base_oid, p,
pack_pos_to_index(p, pos));
error("failed to read delta base object %s"
" at offset %"PRIuMAX" from %s",
oid_to_hex(&base_oid), (uintmax_t)obj_offset,
@ -2066,19 +2082,31 @@ int for_each_object_in_pack(struct packed_git *p,
}
for (i = 0; i < p->num_objects; i++) {
uint32_t pos;
uint32_t index_pos;
struct object_id oid;
/*
* We are iterating "i" from 0 up to num_objects, but its
* meaning may be different, depending on the requested output
* order:
*
* - in object-name order, it is the same as the index order
* used by nth_packed_object_id(), so we can pass it
* directly
*
* - in pack-order, it is pack position, which we must
* convert to an index position in order to get the oid.
*/
if (flags & FOR_EACH_OBJECT_PACK_ORDER)
pos = p->revindex[i].nr;
index_pos = pack_pos_to_index(p, i);
else
pos = i;
index_pos = i;
if (nth_packed_object_id(&oid, p, pos) < 0)
if (nth_packed_object_id(&oid, p, index_pos) < 0)
return error("unable to get sha1 of object %u in %s",
pos, p->pack_name);
index_pos, p->pack_name);
r = cb(&oid, p, pos, data);
r = cb(&oid, p, index_pos, data);
if (r)
break;
}