midx: write object offsets

The final pair of chunks for the multi-pack-index file stores the object
offsets. We default to using 32-bit offsets as in the pack-index version
1 format, but if there exists an offset larger than 32-bits, we use a
trick similar to the pack-index version 2 format by storing all offsets
at least 2^31 in a 64-bit table; we use the 32-bit table to point into
that 64-bit table as necessary.

We only store these 64-bit offsets if necessary, so create a test that
manipulates a version 2 pack-index to fake a large offset. This allows
us to test that the large offset table is created, but the data does not
match the actual packfile offsets. The multi-pack-index offset does match
the (corrupted) pack-index offset, so a future feature will compare these
offsets during a 'verify' step.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Derrick Stolee 2018-07-12 15:39:32 -04:00 committed by Junio C Hamano
parent d7cacf29cc
commit 662148c435
5 changed files with 155 additions and 15 deletions

View File

@ -311,7 +311,20 @@ CHUNK DATA:
The OIDs for all objects in the MIDX are stored in lexicographic
order in this chunk.
(This section intentionally left incomplete.)
Object Offsets (ID: {'O', 'O', 'F', 'F'})
Stores two 4-byte values for every object.
1: The pack-int-id for the pack storing this object.
2: The offset within the pack.
If all offsets are less than 2^32, then the large offset chunk
will not exist and offsets are stored as in IDX v1.
If there is at least one offset value larger than 2^32-1, then
the large offset chunk must exist, and offsets larger than
2^31-1 must be stored in it instead. If the large offset chunk
exists and the 31st bit is on, then removing that bit reveals
the row in the large offsets containing the 8-byte offset of
this object.
[Optional] Object Large Offsets (ID: {'L', 'O', 'F', 'F'})
8-byte offsets into large packfiles.
TRAILER:

100
midx.c
View File

@ -18,13 +18,18 @@
#define MIDX_HASH_LEN 20
#define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
#define MIDX_MAX_CHUNKS 3
#define MIDX_MAX_CHUNKS 5
#define MIDX_CHUNK_ALIGNMENT 4
#define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
#define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
#define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
#define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
#define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
#define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256)
#define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t))
#define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t))
#define MIDX_LARGE_OFFSET_NEEDED 0x80000000
static char *get_midx_filename(const char *object_dir)
{
@ -112,6 +117,14 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir)
m->chunk_oid_lookup = m->data + chunk_offset;
break;
case MIDX_CHUNKID_OBJECTOFFSETS:
m->chunk_object_offsets = m->data + chunk_offset;
break;
case MIDX_CHUNKID_LARGEOFFSETS:
m->chunk_large_offsets = m->data + chunk_offset;
break;
case 0:
die(_("terminating multi-pack-index chunk id appears earlier than expected"));
break;
@ -131,6 +144,8 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir)
die(_("multi-pack-index missing required OID fanout chunk"));
if (!m->chunk_oid_lookup)
die(_("multi-pack-index missing required OID lookup chunk"));
if (!m->chunk_object_offsets)
die(_("multi-pack-index missing required object offsets chunk"));
m->num_objects = ntohl(m->chunk_oid_fanout[255]);
@ -454,6 +469,56 @@ static size_t write_midx_oid_lookup(struct hashfile *f, unsigned char hash_len,
return written;
}
/*
 * Write the object-offsets chunk: for every entry, a 4-byte pack-int-id
 * followed by a 4-byte offset word, both big-endian.
 *
 * When large_offset_needed is set, any offset with a bit at position 31
 * or above is not stored directly; instead the word holds
 * MIDX_LARGE_OFFSET_NEEDED OR-ed with the running index of that entry
 * among the large offsets seen so far. When it is not set, every offset
 * must fit in 32 bits and is stored as-is.
 *
 * Returns the number of bytes written for this chunk.
 */
static size_t write_midx_object_offsets(struct hashfile *f, int large_offset_needed,
					struct pack_midx_entry *objects, uint32_t nr_objects)
{
	uint32_t next_large_row = 0;
	size_t bytes_out = 0;
	uint32_t idx;

	for (idx = 0; idx < nr_objects; idx++) {
		struct pack_midx_entry *entry = &objects[idx];

		hashwrite_be32(f, entry->pack_int_id);

		if (large_offset_needed) {
			if (entry->offset >> 31) {
				/* Point into the large-offset table instead. */
				hashwrite_be32(f, MIDX_LARGE_OFFSET_NEEDED | next_large_row);
				next_large_row++;
			} else {
				hashwrite_be32(f, (uint32_t)entry->offset);
			}
		} else {
			if (entry->offset >> 32)
				BUG("object %s requires a large offset (%"PRIx64") but the MIDX is not writing large offsets!",
				    oid_to_hex(&entry->oid),
				    entry->offset);
			else
				hashwrite_be32(f, (uint32_t)entry->offset);
		}

		bytes_out += MIDX_CHUNK_OFFSET_WIDTH;
	}

	return bytes_out;
}
static size_t write_midx_large_offsets(struct hashfile *f, uint32_t nr_large_offset,
struct pack_midx_entry *objects, uint32_t nr_objects)
{
struct pack_midx_entry *list = objects;
size_t written = 0;
while (nr_large_offset) {
struct pack_midx_entry *obj = list++;
uint64_t offset = obj->offset;
if (!(offset >> 31))
continue;
hashwrite_be32(f, offset >> 32);
hashwrite_be32(f, offset & 0xffffffffUL);
written += 2 * sizeof(uint32_t);
nr_large_offset--;
}
return written;
}
int write_midx_file(const char *object_dir)
{
unsigned char cur_chunk, num_chunks = 0;
@ -466,8 +531,9 @@ int write_midx_file(const char *object_dir)
uint64_t written = 0;
uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
uint32_t nr_entries;
uint32_t nr_entries, num_large_offsets = 0;
struct pack_midx_entry *entries = NULL;
int large_offsets_needed = 0;
midx_name = get_midx_filename(object_dir);
if (safe_create_leading_directories(midx_name)) {
@ -494,13 +560,19 @@ int write_midx_file(const char *object_dir)
sort_packs_by_name(packs.names, packs.nr, pack_perm);
entries = get_sorted_entries(packs.list, pack_perm, packs.nr, &nr_entries);
for (i = 0; i < nr_entries; i++) {
if (entries[i].offset > 0x7fffffff)
num_large_offsets++;
if (entries[i].offset > 0xffffffff)
large_offsets_needed = 1;
}
hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
FREE_AND_NULL(midx_name);
cur_chunk = 0;
num_chunks = 3;
num_chunks = large_offsets_needed ? 5 : 4;
written = write_midx_header(f, num_chunks, packs.nr);
@ -516,9 +588,21 @@ int write_midx_file(const char *object_dir)
chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + MIDX_CHUNK_FANOUT_SIZE;
cur_chunk++;
chunk_ids[cur_chunk] = 0;
chunk_ids[cur_chunk] = MIDX_CHUNKID_OBJECTOFFSETS;
chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_HASH_LEN;
cur_chunk++;
chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_CHUNK_OFFSET_WIDTH;
if (large_offsets_needed) {
chunk_ids[cur_chunk] = MIDX_CHUNKID_LARGEOFFSETS;
cur_chunk++;
chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] +
num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH;
}
chunk_ids[cur_chunk] = 0;
for (i = 0; i <= num_chunks; i++) {
if (i && chunk_offsets[i] < chunk_offsets[i - 1])
BUG("incorrect chunk offsets: %"PRIu64" before %"PRIu64,
@ -556,6 +640,14 @@ int write_midx_file(const char *object_dir)
written += write_midx_oid_lookup(f, MIDX_HASH_LEN, entries, nr_entries);
break;
case MIDX_CHUNKID_OBJECTOFFSETS:
written += write_midx_object_offsets(f, large_offsets_needed, entries, nr_entries);
break;
case MIDX_CHUNKID_LARGEOFFSETS:
written += write_midx_large_offsets(f, num_large_offsets, entries, nr_entries);
break;
default:
BUG("trying to write unknown chunk id %"PRIx32,
chunk_ids[i]);

2
midx.h
View File

@ -17,6 +17,8 @@ struct multi_pack_index {
const unsigned char *chunk_pack_names;
const uint32_t *chunk_oid_fanout;
const unsigned char *chunk_oid_lookup;
const unsigned char *chunk_object_offsets;
const unsigned char *chunk_large_offsets;
const char **pack_names;
char object_dir[FLEX_ARRAY];

View File

@ -26,6 +26,10 @@ static int read_midx_file(const char *object_dir)
printf(" oid-fanout");
if (m->chunk_oid_lookup)
printf(" oid-lookup");
if (m->chunk_object_offsets)
printf(" object-offsets");
if (m->chunk_large_offsets)
printf(" large-offsets");
printf("\nnum_objects: %d\n", m->num_objects);

View File

@ -6,27 +6,30 @@ test_description='multi-pack-indexes'
midx_read_expect () {
NUM_PACKS=$1
NUM_OBJECTS=$2
NUM_CHUNKS=$3
OBJECT_DIR=$4
EXTRA_CHUNKS="$5"
{
cat <<-EOF &&
header: 4d494458 1 3 $NUM_PACKS
chunks: pack-names oid-fanout oid-lookup
header: 4d494458 1 $NUM_CHUNKS $NUM_PACKS
chunks: pack-names oid-fanout oid-lookup object-offsets$EXTRA_CHUNKS
num_objects: $NUM_OBJECTS
packs:
EOF
if test $NUM_PACKS -ge 1
then
ls pack/ | grep idx | sort
ls $OBJECT_DIR/pack/ | grep idx | sort
fi &&
printf "object-dir: .\n"
printf "object-dir: $OBJECT_DIR\n"
} >expect &&
test-tool read-midx . >actual &&
test-tool read-midx $OBJECT_DIR >actual &&
test_cmp expect actual
}
test_expect_success 'write midx with no packs' '
test_when_finished rm -f pack/multi-pack-index &&
git multi-pack-index --object-dir=. write &&
midx_read_expect 0 0
midx_read_expect 0 0 4 .
'
generate_objects () {
@ -76,13 +79,13 @@ test_expect_success 'write midx with one v1 pack' '
pack=$(git pack-objects --index-version=1 pack/test <obj-list) &&
test_when_finished rm pack/test-$pack.pack pack/test-$pack.idx pack/multi-pack-index &&
git multi-pack-index --object-dir=. write &&
midx_read_expect 1 18
midx_read_expect 1 18 4 .
'
test_expect_success 'write midx with one v2 pack' '
git pack-objects --index-version=2,0x40 pack/test <obj-list &&
git multi-pack-index --object-dir=. write &&
midx_read_expect 1 18
midx_read_expect 1 18 4 .
'
test_expect_success 'add more objects' '
@ -96,7 +99,7 @@ test_expect_success 'add more objects' '
test_expect_success 'write midx with two packs' '
git pack-objects --index-version=1 pack/test-2 <obj-list &&
git multi-pack-index --object-dir=. write &&
midx_read_expect 2 34
midx_read_expect 2 34 4 .
'
test_expect_success 'add more packs' '
@ -110,7 +113,33 @@ test_expect_success 'add more packs' '
test_expect_success 'write midx with twelve packs' '
git multi-pack-index --object-dir=. write &&
midx_read_expect 12 74
midx_read_expect 12 74 4 .
'
# usage: corrupt_data <file> <pos> [<data>]
corrupt_data () {
# Overwrite bytes of <file> in place, starting at byte offset <pos>.
# Note: file, pos and data are not declared local, so they remain set
# in the calling shell after this function returns.
file=$1
pos=$2
# When no <data> argument is given, default to the two characters \0,
# which printf below expands to a single NUL byte.
data="${3:-\0}"
# $data is used as the printf format string so that backslash escapes
# (e.g. "\02") become raw bytes. dd then writes them one byte at a time
# (bs=1) at offset $pos; conv=notrunc keeps the rest of the file intact.
printf "$data" | dd of="$file" bs=1 seek="$pos" conv=notrunc
}
# Force 64-bit offsets by manipulating the idx file.
# This makes the IDX file _incorrect_ so be careful to clean up after!
test_expect_success 'force some 64-bit offsets with pack-objects' '
mkdir objects64 &&
mkdir objects64/pack &&
for i in $(test_seq 1 11)
do
generate_objects 11
done &&
commit_and_list_objects &&
pack64=$(git pack-objects --index-version=2,0x40 objects64/pack/test-64 <obj-list) &&
idx64=objects64/pack/test-64-$pack64.idx &&
chmod u+w $idx64 &&
corrupt_data $idx64 2999 "\02" &&
midx64=$(git multi-pack-index --object-dir=objects64 write) &&
midx_read_expect 1 63 5 objects64 " large-offsets"
'
test_done