2023-04-11 15:41:56 +08:00
|
|
|
#include "git-compat-util.h"
|
2006-09-05 12:50:12 +08:00
|
|
|
#include "tag.h"
|
|
|
|
#include "commit.h"
|
2023-03-21 14:25:54 +08:00
|
|
|
#include "gettext.h"
|
2023-02-24 08:09:27 +08:00
|
|
|
#include "hex.h"
|
2006-09-05 12:50:12 +08:00
|
|
|
#include "tree.h"
|
|
|
|
#include "blob.h"
|
|
|
|
#include "diff.h"
|
|
|
|
#include "tree-walk.h"
|
|
|
|
#include "revision.h"
|
|
|
|
#include "list-objects.h"
|
2017-11-22 04:58:50 +08:00
|
|
|
#include "list-objects-filter.h"
|
|
|
|
#include "list-objects-filter-options.h"
|
2017-12-08 23:27:15 +08:00
|
|
|
#include "packfile.h"
|
2023-05-16 14:34:06 +08:00
|
|
|
#include "object-store-ll.h"
|
2018-10-18 08:39:15 +08:00
|
|
|
#include "trace.h"
|
list-objects: respect max_allowed_tree_depth
The tree traversal in list-objects.c, which is used by "rev-list
--objects", etc, uses recursion and may run out of stack space. Let's
teach it about the new core.maxTreeDepth config option.
We unfortunately can't return an error here, as this code doesn't
produce an error return at all. We'll die() instead, which matches the
behavior when we see an otherwise broken tree.
Note that this will also generally reject such deep trees from entering
the repository from a fetch or push, due to the use of rev-list in the
connectivity check. But it's not foolproof! We stop traversing when we
see an UNINTERESTING object, and the connectivity check marks existing
ref tips as UNINTERESTING. So imagine commit X has a tree
with maximum depth N. If you then create a new commit Y with a tree
entry "Y:subdir" that points to "X^{tree}", then the depth of Y will be
N+1. But a connectivity check running "git rev-list --objects Y --not X"
won't realize that; it will stop traversing at X^{tree}, since that was
already reachable.
So this will stop naive pushes of too-deep trees, but not carefully
crafted malicious ones. Doing it robustly and efficiently would require
caching the maximum depth of each tree (i.e., the longest path to any
leaf entry). That's much more complex and not strictly needed. If each
recursive algorithm limits itself already, then that's sufficient.
Blocking the objects from entering the repo would be a nice
belt-and-suspenders addition, but it's not worth the extra cost.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-08-31 14:22:03 +08:00
|
|
|
#include "environment.h"
|
2006-09-05 12:50:12 +08:00
|
|
|
|
2018-08-14 02:14:28 +08:00
|
|
|
struct traversal_context {
|
|
|
|
struct rev_info *revs;
|
|
|
|
show_object_fn show_object;
|
|
|
|
show_commit_fn show_commit;
|
|
|
|
void *show_data;
|
2019-06-28 06:54:05 +08:00
|
|
|
struct filter *filter;
|
list-objects: respect max_allowed_tree_depth
The tree traversal in list-objects.c, which is used by "rev-list
--objects", etc, uses recursion and may run out of stack space. Let's
teach it about the new core.maxTreeDepth config option.
We unfortunately can't return an error here, as this code doesn't
produce an error return at all. We'll die() instead, which matches the
behavior when we see an otherwise broken tree.
Note that this will also generally reject such deep trees from entering
the repository from a fetch or push, due to the use of rev-list in the
connectivity check. But it's not foolproof! We stop traversing when we
see an UNINTERESTING object, and the connectivity check marks existing
ref tips as UNINTERESTING. So imagine commit X has a tree
with maximum depth N. If you then create a new commit Y with a tree
entry "Y:subdir" that points to "X^{tree}", then the depth of Y will be
N+1. But a connectivity check running "git rev-list --objects Y --not X"
won't realize that; it will stop traversing at X^{tree}, since that was
already reachable.
So this will stop naive pushes of too-deep trees, but not carefully
crafted malicious ones. Doing it robustly and efficiently would require
caching the maximum depth of each tree (i.e., the longest path to any
leaf entry). That's much more complex and not strictly needed. If each
recursive algorithm limits itself already, then that's sufficient.
Blocking the objects from entering the repo would be a nice
belt-and-suspenders addition, but it's not worth the extra cost.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-08-31 14:22:03 +08:00
|
|
|
int depth;
|
2018-08-14 02:14:28 +08:00
|
|
|
};
|
|
|
|
|
2022-03-10 00:01:38 +08:00
|
|
|
static void show_commit(struct traversal_context *ctx,
|
|
|
|
struct commit *commit)
|
|
|
|
{
|
|
|
|
if (!ctx->show_commit)
|
|
|
|
return;
|
|
|
|
ctx->show_commit(commit, ctx->show_data);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void show_object(struct traversal_context *ctx,
|
|
|
|
struct object *object,
|
|
|
|
const char *name)
|
|
|
|
{
|
|
|
|
if (!ctx->show_object)
|
|
|
|
return;
|
list-objects: drop --unpacked non-commit objects from results
In git-rev-list(1), we describe the `--unpacked` option as:
Only useful with `--objects`; print the object IDs that are not in
packs.
This is true of commits, which we discard via get_commit_action(), but
not of the objects they reach. So if we ask for an --objects traversal
with --unpacked, we may get arbitrarily many objects which are indeed
packed.
I am nearly certain this behavior dates back to the introduction of
`--unpacked` via 12d2a18780 ("git rev-list --unpacked" shows only
unpacked commits, 2005-07-03), but I couldn't get that revision of Git
to compile for me. At least as early as v2.0.0 this has been subtly
broken:
$ git.compile --version
git version 2.0.0
$ git.compile rev-list --objects --all --unpacked
72791fe96c93f9ec5c311b8bc966ab349b3b5bbe
05713d991c18bbeef7e154f99660005311b5004d v1.0
153ed8b7719c6f5a68ce7ffc43133e95a6ac0fdb
8e4020bb5a8d8c873b25de15933e75cc0fc275df one
9200b628cf9dc883a85a7abc8d6e6730baee589c two
3e6b46e1b7e3b91acce99f6a823104c28aae0b58 unpacked.t
There, only the first, third, and sixth entries are loose, with the
remaining set of objects belonging to at least one pack.
The implications for this are relatively benign: bare 'git repack'
invocations which invoke pack-objects with --unpacked are impacted, and
at worst we'll store a few extra objects that should have been excluded.
Arguably changing this behavior is a backwards-incompatible change,
since it alters the set of objects emitted from rev-list queries with
`--objects` and `--unpacked`. But I argue that this change is still
sensible, since the existing implementation deviates from
clearly-written documentation.
The fix here is straightforward: avoid showing any non-commit objects
which are contained in packs by discarding them within list-objects.c,
before they are shown to the user. Note that similar treatment for
`list-objects.c::show_commit()` is not needed, since that case is
already handled by `revision.c::get_commit_action()`.
Co-authored-by: Jeff King <peff@peff.net>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-11-07 06:56:30 +08:00
|
|
|
if (ctx->revs->unpacked && has_object_pack(&object->oid))
|
|
|
|
return;
|
|
|
|
|
2022-03-10 00:01:38 +08:00
|
|
|
ctx->show_object(object, name, ctx->show_data);
|
|
|
|
}
|
|
|
|
|
2018-08-14 02:14:28 +08:00
|
|
|
static void process_blob(struct traversal_context *ctx,
|
2006-09-05 12:50:12 +08:00
|
|
|
struct blob *blob,
|
2016-02-12 06:26:44 +08:00
|
|
|
struct strbuf *path,
|
2018-08-14 02:14:28 +08:00
|
|
|
const char *name)
|
2006-09-05 12:50:12 +08:00
|
|
|
{
|
|
|
|
struct object *obj = &blob->object;
|
list-objects: pass full pathname to callbacks
When we find a blob at "a/b/c", we currently pass this to
our show_object_fn callbacks as two components: "a/b/" and
"c". Callbacks which want the full value then call
path_name(), which concatenates the two. But this is an
inefficient interface; the path is a strbuf, and we could
simply append "c" to it temporarily, then roll back the
length, without creating a new copy.
So we could improve this by teaching the callsites of
path_name() this trick (and there are only 3). But we can
also notice that no callback actually cares about the
broken-down representation, and simply pass each callback
the full path "a/b/c" as a string. The callback code becomes
even simpler, then, as we do not have to worry about freeing
an allocated buffer, nor rolling back our modification to
the strbuf.
This is theoretically less efficient, as some callbacks
would not bother to format the final path component. But in
practice this is not measurable. Since we use the same
strbuf over and over, our work to grow it is amortized, and
we really only pay to memcpy a few bytes.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-02-12 06:28:36 +08:00
|
|
|
size_t pathlen;
|
2019-06-28 06:54:05 +08:00
|
|
|
enum list_objects_filter_result r;
|
2006-09-05 12:50:12 +08:00
|
|
|
|
2018-08-14 02:14:28 +08:00
|
|
|
if (!ctx->revs->blob_objects)
|
2006-09-05 12:50:12 +08:00
|
|
|
return;
|
2008-02-19 04:47:56 +08:00
|
|
|
if (!obj)
|
|
|
|
die("bad blob object");
|
2006-09-05 12:50:12 +08:00
|
|
|
if (obj->flags & (UNINTERESTING | SEEN))
|
|
|
|
return;
|
list-objects: pass full pathname to callbacks
When we find a blob at "a/b/c", we currently pass this to
our show_object_fn callbacks as two components: "a/b/" and
"c". Callbacks which want the full value then call
path_name(), which concatenates the two. But this is an
inefficient interface; the path is a strbuf, and we could
simply append "c" to it temporarily, then roll back the
length, without creating a new copy.
So we could improve this by teaching the callsites of
path_name() this trick (and there are only 3). But we can
also notice that no callback actually cares about the
broken-down representation, and simply pass each callback
the full path "a/b/c" as a string. The callback code becomes
even simpler, then, as we do not have to worry about freeing
an allocated buffer, nor rolling back our modification to
the strbuf.
This is theoretically less efficient, as some callbacks
would not bother to format the final path component. But in
practice this is not measurable. Since we use the same
strbuf over and over, our work to grow it is amortized, and
we really only pay to memcpy a few bytes.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-02-12 06:28:36 +08:00
|
|
|
|
2017-12-08 23:27:15 +08:00
|
|
|
/*
|
|
|
|
* Pre-filter known-missing objects when explicitly requested.
|
|
|
|
* Otherwise, a missing object error message may be reported
|
|
|
|
* later (depending on other filtering criteria).
|
|
|
|
*
|
|
|
|
* Note that this "--exclude-promisor-objects" pre-filtering
|
|
|
|
* may cause the actual filter to report an incomplete list
|
|
|
|
* of missing objects.
|
|
|
|
*/
|
2018-08-14 02:14:28 +08:00
|
|
|
if (ctx->revs->exclude_promisor_objects &&
|
2023-03-28 21:58:50 +08:00
|
|
|
!repo_has_object_file(the_repository, &obj->oid) &&
|
2017-12-08 23:27:15 +08:00
|
|
|
is_promisor_object(&obj->oid))
|
|
|
|
return;
|
|
|
|
|
list-objects: pass full pathname to callbacks
When we find a blob at "a/b/c", we currently pass this to
our show_object_fn callbacks as two components: "a/b/" and
"c". Callbacks which want the full value then call
path_name(), which concatenates the two. But this is an
inefficient interface; the path is a strbuf, and we could
simply append "c" to it temporarily, then roll back the
length, without creating a new copy.
So we could improve this by teaching the callsites of
path_name() this trick (and there are only 3). But we can
also notice that no callback actually cares about the
broken-down representation, and simply pass each callback
the full path "a/b/c" as a string. The callback code becomes
even simpler, then, as we do not have to worry about freeing
an allocated buffer, nor rolling back our modification to
the strbuf.
This is theoretically less efficient, as some callbacks
would not bother to format the final path component. But in
practice this is not measurable. Since we use the same
strbuf over and over, our work to grow it is amortized, and
we really only pay to memcpy a few bytes.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-02-12 06:28:36 +08:00
|
|
|
pathlen = path->len;
|
|
|
|
strbuf_addstr(path, name);
|
2019-06-28 06:54:05 +08:00
|
|
|
r = list_objects_filter__filter_object(ctx->revs->repo,
|
|
|
|
LOFS_BLOB, obj,
|
|
|
|
path->buf, &path->buf[pathlen],
|
|
|
|
ctx->filter);
|
2017-11-22 04:58:50 +08:00
|
|
|
if (r & LOFR_MARK_SEEN)
|
|
|
|
obj->flags |= SEEN;
|
|
|
|
if (r & LOFR_DO_SHOW)
|
2022-03-10 00:01:38 +08:00
|
|
|
show_object(ctx, obj, path->buf);
|
list-objects: pass full pathname to callbacks
When we find a blob at "a/b/c", we currently pass this to
our show_object_fn callbacks as two components: "a/b/" and
"c". Callbacks which want the full value then call
path_name(), which concatenates the two. But this is an
inefficient interface; the path is a strbuf, and we could
simply append "c" to it temporarily, then roll back the
length, without creating a new copy.
So we could improve this by teaching the callsites of
path_name() this trick (and there are only 3). But we can
also notice that no callback actually cares about the
broken-down representation, and simply pass each callback
the full path "a/b/c" as a string. The callback code becomes
even simpler, then, as we do not have to worry about freeing
an allocated buffer, nor rolling back our modification to
the strbuf.
This is theoretically less efficient, as some callbacks
would not bother to format the final path component. But in
practice this is not measurable. Since we use the same
strbuf over and over, our work to grow it is amortized, and
we really only pay to memcpy a few bytes.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-02-12 06:28:36 +08:00
|
|
|
strbuf_setlen(path, pathlen);
|
2006-09-05 12:50:12 +08:00
|
|
|
}
|
|
|
|
|
2018-08-14 02:14:29 +08:00
|
|
|
/*
 * Defined further down.  Declared here because process_tree_contents()
 * calls it for every sub-tree entry it encounters.
 */
static void process_tree(struct traversal_context *ctx,
			 struct tree *tree,
			 struct strbuf *base,
			 const char *name);
|
|
|
|
|
|
|
|
static void process_tree_contents(struct traversal_context *ctx,
|
|
|
|
struct tree *tree,
|
|
|
|
struct strbuf *base)
|
2006-09-05 12:50:12 +08:00
|
|
|
{
|
|
|
|
struct tree_desc desc;
|
|
|
|
struct name_entry entry;
|
2018-08-14 02:14:29 +08:00
|
|
|
enum interesting match = ctx->revs->diffopt.pathspec.nr == 0 ?
|
|
|
|
all_entries_interesting : entry_not_interesting;
|
|
|
|
|
2023-10-02 10:40:28 +08:00
|
|
|
init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size);
|
2018-08-14 02:14:29 +08:00
|
|
|
|
|
|
|
while (tree_entry(&desc, &entry)) {
|
|
|
|
if (match != all_entries_interesting) {
|
2018-11-19 00:47:57 +08:00
|
|
|
match = tree_entry_interesting(ctx->revs->repo->index,
|
2023-07-08 06:21:15 +08:00
|
|
|
&entry, base,
|
2018-08-14 02:14:29 +08:00
|
|
|
&ctx->revs->diffopt.pathspec);
|
|
|
|
if (match == all_entries_not_interesting)
|
|
|
|
break;
|
|
|
|
if (match == entry_not_interesting)
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2018-10-06 05:31:24 +08:00
|
|
|
if (S_ISDIR(entry.mode)) {
|
2019-01-15 08:39:44 +08:00
|
|
|
struct tree *t = lookup_tree(ctx->revs->repo, &entry.oid);
|
2019-04-10 10:13:19 +08:00
|
|
|
if (!t) {
|
|
|
|
die(_("entry '%s' in tree %s has tree mode, "
|
|
|
|
"but is not a tree"),
|
|
|
|
entry.path, oid_to_hex(&tree->object.oid));
|
|
|
|
}
|
2018-10-06 05:31:24 +08:00
|
|
|
t->object.flags |= NOT_USER_GIVEN;
|
list-objects: respect max_allowed_tree_depth
The tree traversal in list-objects.c, which is used by "rev-list
--objects", etc, uses recursion and may run out of stack space. Let's
teach it about the new core.maxTreeDepth config option.
We unfortunately can't return an error here, as this code doesn't
produce an error return at all. We'll die() instead, which matches the
behavior when we see an otherwise broken tree.
Note that this will also generally reject such deep trees from entering
the repository from a fetch or push, due to the use of rev-list in the
connectivity check. But it's not foolproof! We stop traversing when we
see an UNINTERESTING object, and the connectivity check marks existing
ref tips as UNINTERESTING. So imagine commit X has a tree
with maximum depth N. If you then create a new commit Y with a tree
entry "Y:subdir" that points to "X^{tree}", then the depth of Y will be
N+1. But a connectivity check running "git rev-list --objects Y --not X"
won't realize that; it will stop traversing at X^{tree}, since that was
already reachable.
So this will stop naive pushes of too-deep trees, but not carefully
crafted malicious ones. Doing it robustly and efficiently would require
caching the maximum depth of each tree (i.e., the longest path to any
leaf entry). That's much more complex and not strictly needed. If each
recursive algorithm limits itself already, then that's sufficient.
Blocking the objects from entering the repo would be a nice
belt-and-suspenders addition, but it's not worth the extra cost.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-08-31 14:22:03 +08:00
|
|
|
ctx->depth++;
|
2018-10-06 05:31:24 +08:00
|
|
|
process_tree(ctx, t, base, entry.path);
|
list-objects: respect max_allowed_tree_depth
The tree traversal in list-objects.c, which is used by "rev-list
--objects", etc, uses recursion and may run out of stack space. Let's
teach it about the new core.maxTreeDepth config option.
We unfortunately can't return an error here, as this code doesn't
produce an error return at all. We'll die() instead, which matches the
behavior when we see an otherwise broken tree.
Note that this will also generally reject such deep trees from entering
the repository from a fetch or push, due to the use of rev-list in the
connectivity check. But it's not foolproof! We stop traversing when we
see an UNINTERESTING object, and the connectivity check marks existing
ref tips as UNINTERESTING. So imagine commit X has a tree
with maximum depth N. If you then create a new commit Y with a tree
entry "Y:subdir" that points to "X^{tree}", then the depth of Y will be
N+1. But a connectivity check running "git rev-list --objects Y --not X"
won't realize that; it will stop traversing at X^{tree}, since that was
already reachable.
So this will stop naive pushes of too-deep trees, but not carefully
crafted malicious ones. Doing it robustly and efficiently would require
caching the maximum depth of each tree (i.e., the longest path to any
leaf entry). That's much more complex and not strictly needed. If each
recursive algorithm limits itself already, then that's sufficient.
Blocking the objects from entering the repo would be a nice
belt-and-suspenders addition, but it's not worth the extra cost.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-08-31 14:22:03 +08:00
|
|
|
ctx->depth--;
|
2018-10-06 05:31:24 +08:00
|
|
|
}
|
2018-08-14 02:14:29 +08:00
|
|
|
else if (S_ISGITLINK(entry.mode))
|
2022-12-13 19:12:09 +08:00
|
|
|
; /* ignore gitlink */
|
2018-10-06 05:31:24 +08:00
|
|
|
else {
|
2019-01-15 08:39:44 +08:00
|
|
|
struct blob *b = lookup_blob(ctx->revs->repo, &entry.oid);
|
2019-04-10 10:13:17 +08:00
|
|
|
if (!b) {
|
|
|
|
die(_("entry '%s' in tree %s has blob mode, "
|
|
|
|
"but is not a blob"),
|
|
|
|
entry.path, oid_to_hex(&tree->object.oid));
|
|
|
|
}
|
2018-10-06 05:31:24 +08:00
|
|
|
b->object.flags |= NOT_USER_GIVEN;
|
|
|
|
process_blob(ctx, b, base, entry.path);
|
|
|
|
}
|
2018-08-14 02:14:29 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-08-14 02:14:28 +08:00
|
|
|
static void process_tree(struct traversal_context *ctx,
|
2006-09-05 12:50:12 +08:00
|
|
|
struct tree *tree,
|
2010-12-17 21:26:47 +08:00
|
|
|
struct strbuf *base,
|
2018-08-14 02:14:28 +08:00
|
|
|
const char *name)
|
2006-09-05 12:50:12 +08:00
|
|
|
{
|
|
|
|
struct object *obj = &tree->object;
|
2018-08-14 02:14:28 +08:00
|
|
|
struct rev_info *revs = ctx->revs;
|
2010-12-17 21:26:47 +08:00
|
|
|
int baselen = base->len;
|
2019-06-28 06:54:05 +08:00
|
|
|
enum list_objects_filter_result r;
|
2018-10-06 05:31:23 +08:00
|
|
|
int failed_parse;
|
2006-09-05 12:50:12 +08:00
|
|
|
|
|
|
|
if (!revs->tree_objects)
|
|
|
|
return;
|
2008-02-19 04:47:56 +08:00
|
|
|
if (!obj)
|
|
|
|
die("bad tree object");
|
2006-09-05 12:50:12 +08:00
|
|
|
if (obj->flags & (UNINTERESTING | SEEN))
|
|
|
|
return;
|
bitmaps: don't recurse into trees already in the bitmap
If an object is already mentioned in a reachability bitmap we are
building, then by definition so are all of the objects it can reach. We
have an optimization to stop traversing commits when we see they are
already in the bitmap, but we don't do the same for trees.
It's generally unavoidable to recurse into trees for commits not yet
covered by bitmaps (since most commits generally do have unique
top-level trees). But they usually have subtrees that are shared with
other commits (i.e., all of the subtrees the commit _didn't_ touch). And
some of those commits (and their trees) may be covered by the bitmap.
Usually this isn't _too_ big a deal, because we'll visit those subtrees
only once in total for the whole walk. But if you have a large number of
unbitmapped commits, and if your tree is big, then you may end up
opening a lot of sub-trees for no good reason.
We can use the same optimization we do for commits here: when we are
about to open a tree, see if it's in the bitmap (either the one we are
building, or the "seen" bitmap which covers the UNINTERESTING side of
the bitmap when doing a set-difference).
This works especially well because we'll visit all commits before
hitting any trees. So even in a history like:
A -- B
if "A" has a bitmap on disk but "B" doesn't, we'll already have OR-ed in
the results from A before looking at B's tree (so we really will only
look at trees touched by B).
For most repositories, the timings produced by p5310 are unspectacular.
Here's linux.git:
Test HEAD^ HEAD
--------------------------------------------------------------------
5310.4: simulated clone 6.00(5.90+0.10) 5.98(5.90+0.08) -0.3%
5310.5: simulated fetch 2.98(5.45+0.18) 2.85(5.31+0.18) -4.4%
5310.7: rev-list (commits) 0.32(0.29+0.03) 0.33(0.30+0.03) +3.1%
5310.8: rev-list (objects) 1.48(1.44+0.03) 1.49(1.44+0.05) +0.7%
Any improvement there is within the noise (the +3.1% on test 7 has to be
noise, since we are not recursing into trees, and thus the new code
isn't even run). The results for git.git are likewise uninteresting.
But here are numbers from some other real-world repositories (that are
not public). This one's tree is comparable in size to linux.git, but has
~16k refs (and so less complete bitmap coverage):
Test HEAD^ HEAD
-------------------------------------------------------------------------
5310.4: simulated clone 38.34(39.86+0.74) 33.95(35.53+0.76) -11.5%
5310.5: simulated fetch 2.29(6.31+0.35) 2.20(5.97+0.41) -3.9%
5310.7: rev-list (commits) 0.99(0.86+0.13) 0.96(0.85+0.11) -3.0%
5310.8: rev-list (objects) 11.32(11.04+0.27) 6.59(6.37+0.21) -41.8%
And here's another with a very large tree (~340k entries), and a fairly
large number of refs (~10k):
Test HEAD^ HEAD
-------------------------------------------------------------------------
5310.3: simulated clone 53.83(54.71+1.54) 39.77(40.76+1.50) -26.1%
5310.4: simulated fetch 19.91(20.11+0.56) 19.79(19.98+0.67) -0.6%
5310.6: rev-list (commits) 0.54(0.44+0.11) 0.51(0.43+0.07) -5.6%
5310.7: rev-list (objects) 24.32(23.59+0.73) 9.85(9.49+0.36) -59.5%
This patch provides substantial improvements in these larger cases, and
doesn't have any drawbacks for smaller ones (the cost of the bitmap check is
quite small compared to an actual tree traversal).
Note that we have to add a version of revision.c's include_check
callback which handles non-commits. We could possibly consolidate this
into a single callback for all objects types, as there's only one user
of the feature which would need to be converted (pack-bitmap.c:should_include).
That would in theory let us avoid duplicating any logic. But when I
tried it, the code ended up much worse to read, with lots of repeated
"if it's a commit do this, otherwise do that". Having two separate
callbacks splits that naturally, and matches the existing split of
show_commit/show_object callbacks.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-06-14 20:05:44 +08:00
|
|
|
if (revs->include_check_obj &&
|
|
|
|
!revs->include_check_obj(&tree->object, revs->include_check_data))
|
|
|
|
return;
|
2018-10-06 05:31:23 +08:00
|
|
|
|
list-objects: respect max_allowed_tree_depth
The tree traversal in list-objects.c, which is used by "rev-list
--objects", etc, uses recursion and may run out of stack space. Let's
teach it about the new core.maxTreeDepth config option.
We unfortunately can't return an error here, as this code doesn't
produce an error return at all. We'll die() instead, which matches the
behavior when we see an otherwise broken tree.
Note that this will also generally reject such deep trees from entering
the repository from a fetch or push, due to the use of rev-list in the
connectivity check. But it's not foolproof! We stop traversing when we
see an UNINTERESTING object, and the connectivity check marks existing
ref tips as UNINTERESTING. So imagine commit X has a tree
with maximum depth N. If you then create a new commit Y with a tree
entry "Y:subdir" that points to "X^{tree}", then the depth of Y will be
N+1. But a connectivity check running "git rev-list --objects Y --not X"
won't realize that; it will stop traversing at X^{tree}, since that was
already reachable.
So this will stop naive pushes of too-deep trees, but not carefully
crafted malicious ones. Doing it robustly and efficiently would require
caching the maximum depth of each tree (i.e., the longest path to any
leaf entry). That's much more complex and not strictly needed. If each
recursive algorithm limits itself already, then that's sufficient.
Blocking the objects from entering the repo would be a nice
belt-and-suspenders addition, but it's not worth the extra cost.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-08-31 14:22:03 +08:00
|
|
|
if (ctx->depth > max_allowed_tree_depth)
|
|
|
|
die("exceeded maximum allowed tree depth");
|
|
|
|
|
2018-10-06 05:31:23 +08:00
|
|
|
failed_parse = parse_tree_gently(tree, 1);
|
|
|
|
if (failed_parse) {
|
add `ignore_missing_links` mode to revwalk
When pack-objects is computing the reachability bitmap to
serve a fetch request, it can erroneously die() if some of
the UNINTERESTING objects are not present. Upload-pack
throws away HAVE lines from the client for objects we do not
have, but we may have a tip object without all of its
ancestors (e.g., if the tip is no longer reachable and was
new enough to survive a `git prune`, but some of its
reachable objects did get pruned).
In the non-bitmap case, we do a revision walk with the HAVE
objects marked as UNINTERESTING. The revision walker
explicitly ignores errors in accessing UNINTERESTING commits
to handle this case (and we do not bother looking at
UNINTERESTING trees or blobs at all).
When we have bitmaps, however, the process is quite
different. The bitmap index for a pack-objects run is
calculated in two separate steps:
First, we perform an extensive walk from all the HAVEs to
find the full set of objects reachable from them. This walk
is usually optimized away because we are expected to hit an
object with a bitmap during the traversal, which allows us
to terminate early.
Secondly, we perform an extensive walk from all the WANTs,
which usually also terminates early because we hit a commit
with an existing bitmap.
Once we have the resulting bitmaps from the two walks, we
AND-NOT them together to obtain the resulting set of objects
we need to pack.
When we are walking the HAVE objects, the revision walker
does not know that we are walking it only to mark the
results as uninteresting. We strip out the UNINTERESTING flag,
because those objects _are_ interesting to us during the
first walk. We want to keep going to get a complete set of
reachable objects if we can.
We need some way to tell the revision walker that it's OK to
silently truncate the HAVE walk, just like it does for the
UNINTERESTING case. This patch introduces a new
`ignore_missing_links` flag to the `rev_info` struct, which
we set only for the HAVE walk.
It also adds tests to cover UNINTERESTING objects missing
from several positions: a missing blob, a missing tree, and
a missing parent commit. The missing blob already worked (as
we do not care about its contents at all), but the other two
cases caused us to die().
Note that there are a few cases we do not need to test:
1. We do not need to test a missing tree, with the blob
still present. Without the tree that refers to it, we
would not know that the blob is relevant to our walk.
2. We do not need to test a tip commit that is missing.
Upload-pack omits these for us (and in fact, we
complain even in the non-bitmap case if it fails to do
so).
Reported-by: Siddharth Agarwal <sid0@fb.com>
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-03-28 18:00:43 +08:00
|
|
|
if (revs->ignore_missing_links)
|
|
|
|
return;
|
2017-12-08 23:27:15 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Pre-filter known-missing tree objects when explicitly
|
|
|
|
* requested. This may cause the actual filter to report
|
|
|
|
* an incomplete list of missing objects.
|
|
|
|
*/
|
|
|
|
if (revs->exclude_promisor_objects &&
|
|
|
|
is_promisor_object(&obj->oid))
|
|
|
|
return;
|
|
|
|
|
2023-10-26 18:11:07 +08:00
|
|
|
if (!revs->do_not_die_on_missing_objects)
|
2018-10-06 05:31:23 +08:00
|
|
|
die("bad tree object %s", oid_to_hex(&obj->oid));
|
add `ignore_missing_links` mode to revwalk
When pack-objects is computing the reachability bitmap to
serve a fetch request, it can erroneously die() if some of
the UNINTERESTING objects are not present. Upload-pack
throws away HAVE lines from the client for objects we do not
have, but we may have a tip object without all of its
ancestors (e.g., if the tip is no longer reachable and was
new enough to survive a `git prune`, but some of its
reachable objects did get pruned).
In the non-bitmap case, we do a revision walk with the HAVE
objects marked as UNINTERESTING. The revision walker
explicitly ignores errors in accessing UNINTERESTING commits
to handle this case (and we do not bother looking at
UNINTERESTING trees or blobs at all).
When we have bitmaps, however, the process is quite
different. The bitmap index for a pack-objects run is
calculated in two separate steps:
First, we perform an extensive walk from all the HAVEs to
find the full set of objects reachable from them. This walk
is usually optimized away because we are expected to hit an
object with a bitmap during the traversal, which allows us
to terminate early.
Secondly, we perform an extensive walk from all the WANTs,
which usually also terminates early because we hit a commit
with an existing bitmap.
Once we have the resulting bitmaps from the two walks, we
AND-NOT them together to obtain the resulting set of objects
we need to pack.
When we are walking the HAVE objects, the revision walker
does not know that we are walking it only to mark the
results as uninteresting. We strip out the UNINTERESTING flag,
because those objects _are_ interesting to us during the
first walk. We want to keep going to get a complete set of
reachable objects if we can.
We need some way to tell the revision walker that it's OK to
silently truncate the HAVE walk, just like it does for the
UNINTERESTING case. This patch introduces a new
`ignore_missing_links` flag to the `rev_info` struct, which
we set only for the HAVE walk.
It also adds tests to cover UNINTERESTING objects missing
from several positions: a missing blob, a missing tree, and
a missing parent commit. The missing blob already worked (as
we do not care about its contents at all), but the other two
cases caused us to die().
Note that there are a few cases we do not need to test:
1. We do not need to test a missing tree, with the blob
still present. Without the tree that refers to it, we
would not know that the blob is relevant to our walk.
2. We do not need to test a tip commit that is missing.
Upload-pack omits these for us (and in fact, we
complain even in the non-bitmap case if it fails to do
so).
Reported-by: Siddharth Agarwal <sid0@fb.com>
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-03-28 18:00:43 +08:00
|
|
|
}
|
list-objects: convert name_path to a strbuf
The "struct name_path" data is examined in only two places:
we generate it in process_tree(), and we convert it to a
single string in path_name(). Everyone else just passes it
through to those functions.
We can further note that process_tree() already keeps a
single strbuf with the leading tree path, for use with
tree_entry_interesting().
Instead of building a separate name_path linked list, let's
just use the one we already build in "base". This reduces
the amount of code (especially tricky code in path_name()
which did not check for integer overflows caused by deep
or large pathnames).
It is also more efficient in some instances. Any time we
were using tree_entry_interesting, we were building up the
strbuf anyway, so this is an immediate and obvious win
there. In cases where we were not, we trade off storing
"pathname/" in a strbuf on the heap for each level of the
path, instead of two pointers and an int on the stack (with
one pointer into the tree object). On a 64-bit system, the
latter is 20 bytes; so if path components are less than that
on average, this has lower peak memory usage. In practice
it probably doesn't matter either way; we are already
holding in memory all of the tree objects leading up to each
pathname, and for normal-depth pathnames, we are only
talking about hundreds of bytes.
This patch leaves "struct name_path" as a thin wrapper
around the strbuf, to avoid disrupting callbacks. We should
fix them, but leaving it out makes this diff easier to view.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-02-12 06:26:18 +08:00
|
|
|
|
|
|
|
strbuf_addstr(base, name);
|
2019-06-28 06:54:05 +08:00
|
|
|
r = list_objects_filter__filter_object(ctx->revs->repo,
|
|
|
|
LOFS_BEGIN_TREE, obj,
|
|
|
|
base->buf, &base->buf[baselen],
|
|
|
|
ctx->filter);
|
2017-11-22 04:58:50 +08:00
|
|
|
if (r & LOFR_MARK_SEEN)
|
|
|
|
obj->flags |= SEEN;
|
|
|
|
if (r & LOFR_DO_SHOW)
|
2022-03-10 00:01:38 +08:00
|
|
|
show_object(ctx, obj, base->buf);
|
list-objects: convert name_path to a strbuf
The "struct name_path" data is examined in only two places:
we generate it in process_tree(), and we convert it to a
single string in path_name(). Everyone else just passes it
through to those functions.
We can further note that process_tree() already keeps a
single strbuf with the leading tree path, for use with
tree_entry_interesting().
Instead of building a separate name_path linked list, let's
just use the one we already build in "base". This reduces
the amount of code (especially tricky code in path_name()
which did not check for integer overflows caused by deep
or large pathnames).
It is also more efficient in some instances. Any time we
were using tree_entry_interesting, we were building up the
strbuf anyway, so this is an immediate and obvious win
there. In cases where we were not, we trade off storing
"pathname/" in a strbuf on the heap for each level of the
path, instead of two pointers and an int on the stack (with
one pointer into the tree object). On a 64-bit system, the
latter is 20 bytes; so if path components are less than that
on average, this has lower peak memory usage. In practice
it probably doesn't matter either way; we are already
holding in memory all of the tree objects leading up to each
pathname, and for normal-depth pathnames, we are only
talking about hundreds of bytes.
This patch leaves "struct name_path" as a thin wrapper
around the strbuf, to avoid disrupting callbacks. We should
fix them, but leaving it out makes this diff easier to view.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-02-12 06:26:18 +08:00
|
|
|
if (base->len)
|
|
|
|
strbuf_addch(base, '/');
|
2010-12-17 21:26:47 +08:00
|
|
|
|
2018-10-18 08:39:15 +08:00
|
|
|
if (r & LOFR_SKIP_TREE)
|
|
|
|
trace_printf("Skipping contents of tree %s...\n", base->buf);
|
|
|
|
else if (!failed_parse)
|
2018-10-06 05:31:23 +08:00
|
|
|
process_tree_contents(ctx, tree, base);
|
2006-09-05 12:50:12 +08:00
|
|
|
|
2019-06-28 06:54:05 +08:00
|
|
|
r = list_objects_filter__filter_object(ctx->revs->repo,
|
|
|
|
LOFS_END_TREE, obj,
|
|
|
|
base->buf, &base->buf[baselen],
|
|
|
|
ctx->filter);
|
|
|
|
if (r & LOFR_MARK_SEEN)
|
|
|
|
obj->flags |= SEEN;
|
|
|
|
if (r & LOFR_DO_SHOW)
|
2022-03-10 00:01:38 +08:00
|
|
|
show_object(ctx, obj, base->buf);
|
2017-11-22 04:58:50 +08:00
|
|
|
|
2010-12-17 21:26:47 +08:00
|
|
|
strbuf_setlen(base, baselen);
|
2013-06-06 06:37:39 +08:00
|
|
|
free_tree_buffer(tree);
|
2006-09-05 12:50:12 +08:00
|
|
|
}
|
|
|
|
|
2021-04-09 19:28:02 +08:00
|
|
|
static void process_tag(struct traversal_context *ctx,
|
|
|
|
struct tag *tag,
|
|
|
|
const char *name)
|
|
|
|
{
|
2021-04-12 21:37:35 +08:00
|
|
|
enum list_objects_filter_result r;
|
|
|
|
|
|
|
|
r = list_objects_filter__filter_object(ctx->revs->repo, LOFS_TAG,
|
|
|
|
&tag->object, NULL, NULL,
|
|
|
|
ctx->filter);
|
|
|
|
if (r & LOFR_MARK_SEEN)
|
|
|
|
tag->object.flags |= SEEN;
|
|
|
|
if (r & LOFR_DO_SHOW)
|
2022-03-10 00:01:38 +08:00
|
|
|
show_object(ctx, &tag->object, name);
|
2021-04-09 19:28:02 +08:00
|
|
|
}
|
|
|
|
|
2006-09-06 16:42:23 +08:00
|
|
|
static void mark_edge_parents_uninteresting(struct commit *commit,
|
|
|
|
struct rev_info *revs,
|
|
|
|
show_edge_fn show_edge)
|
|
|
|
{
|
|
|
|
struct commit_list *parents;
|
|
|
|
|
|
|
|
for (parents = commit->parents; parents; parents = parents->next) {
|
|
|
|
struct commit *parent = parents->item;
|
|
|
|
if (!(parent->object.flags & UNINTERESTING))
|
|
|
|
continue;
|
2023-03-28 21:58:48 +08:00
|
|
|
mark_tree_uninteresting(revs->repo,
|
|
|
|
repo_get_commit_tree(the_repository, parent));
|
2006-09-06 16:42:23 +08:00
|
|
|
if (revs->edge_hint && !(parent->object.flags & SHOWN)) {
|
|
|
|
parent->object.flags |= SHOWN;
|
|
|
|
show_edge(parent);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-01-17 02:25:58 +08:00
|
|
|
static void add_edge_parents(struct commit *commit,
|
|
|
|
struct rev_info *revs,
|
|
|
|
show_edge_fn show_edge,
|
|
|
|
struct oidset *set)
|
|
|
|
{
|
|
|
|
struct commit_list *parents;
|
|
|
|
|
|
|
|
for (parents = commit->parents; parents; parents = parents->next) {
|
|
|
|
struct commit *parent = parents->item;
|
2023-03-28 21:58:48 +08:00
|
|
|
struct tree *tree = repo_get_commit_tree(the_repository,
|
|
|
|
parent);
|
2019-01-17 02:25:58 +08:00
|
|
|
|
|
|
|
if (!tree)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
oidset_insert(set, &tree->object.oid);
|
|
|
|
|
|
|
|
if (!(parent->object.flags & UNINTERESTING))
|
|
|
|
continue;
|
|
|
|
tree->object.flags |= UNINTERESTING;
|
|
|
|
|
|
|
|
if (revs->edge_hint && !(parent->object.flags & SHOWN)) {
|
|
|
|
parent->object.flags |= SHOWN;
|
|
|
|
show_edge(parent);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void mark_edges_uninteresting(struct rev_info *revs,
|
|
|
|
show_edge_fn show_edge,
|
|
|
|
int sparse)
|
2006-09-06 16:42:23 +08:00
|
|
|
{
|
2013-08-16 17:52:06 +08:00
|
|
|
struct commit_list *list;
|
list-objects: mark more commits as edges in mark_edges_uninteresting
The purpose of edge commits is to let pack-objects know what objects
it can use as base, but does not need to include in the thin pack
because the other side is supposed to already have them. So far we
mark uninteresting parents of interesting commits as edges. But even
an unrelated uninteresting commit (that the other side has) may
become a good base for pack-objects and help produce more efficient
packs.
This is especially true for shallow clone, when the client issues a
fetch with a depth smaller or equal to the number of commits the
server is ahead of the client. For example, in this commit history
the client has up to "A" and the server has up to "B":
-------A---B
have--^ ^
/
want--+
If depth 1 is requested, the commit list to send to the client
includes only B. The way m_e_u is working, it checks if parent
commits of B are uninteresting, if so mark them as edges. Due to
shallow effect, commit B is grafted to have no parents and the
revision walker never sees A as the parent of B. In fact it marks no
edges at all in this simple case and sends everything B has to the
client even if it could have excluded what A and also the client
already have.
In a slightly different case where A is not a direct parent of B
(iow there are commits in between A and B), marking A as an edge can
still save some because B may still have stuff from the far ancestor
A.
There is another case from the earlier patch, when we deepen a ref
from C->E to A->E:
---A---B C---D---E
want--^ ^ ^
shallow-+ /
have-------+
In this case we need to send A and B to the client, and C (i.e. the
current shallow point that the client informs the server) is a very
good base because it's closet to A and B. Normal m_e_u won't recognize
C as an edge because it only looks back to parents (i.e. A<-B) not the
opposite way B->C even if C is already marked as uninteresting commit
by the previous patch.
This patch includes all uninteresting commits from command line as
edges and lets pack-objects decide what's best to do. The upside is we
have better chance of producing better packs in certain cases. The
downside is we may need to process some extra objects on the server
side.
For the shallow case on git.git, when the client is 5 commits behind
and does "fetch --depth=3", the result pack is 99.26 KiB instead of
4.92 MiB.
Reported-and-analyzed-by: Matthijs Kooijman <matthijs@stdin.nl>
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-08-16 17:52:07 +08:00
|
|
|
int i;
|
|
|
|
|
2019-01-17 02:25:58 +08:00
|
|
|
if (sparse) {
|
|
|
|
struct oidset set;
|
|
|
|
oidset_init(&set, 16);
|
|
|
|
|
|
|
|
for (list = revs->commits; list; list = list->next) {
|
|
|
|
struct commit *commit = list->item;
|
2023-03-28 21:58:48 +08:00
|
|
|
struct tree *tree = repo_get_commit_tree(the_repository,
|
|
|
|
commit);
|
2006-09-06 16:42:23 +08:00
|
|
|
|
2019-01-17 02:25:58 +08:00
|
|
|
if (commit->object.flags & UNINTERESTING)
|
|
|
|
tree->object.flags |= UNINTERESTING;
|
|
|
|
|
|
|
|
oidset_insert(&set, &tree->object.oid);
|
|
|
|
add_edge_parents(commit, revs, show_edge, &set);
|
|
|
|
}
|
|
|
|
|
|
|
|
mark_trees_uninteresting_sparse(revs->repo, &set);
|
|
|
|
oidset_clear(&set);
|
|
|
|
} else {
|
|
|
|
for (list = revs->commits; list; list = list->next) {
|
|
|
|
struct commit *commit = list->item;
|
|
|
|
if (commit->object.flags & UNINTERESTING) {
|
|
|
|
mark_tree_uninteresting(revs->repo,
|
2023-03-28 21:58:48 +08:00
|
|
|
repo_get_commit_tree(the_repository, commit));
|
2019-01-17 02:25:58 +08:00
|
|
|
if (revs->edge_hint_aggressive && !(commit->object.flags & SHOWN)) {
|
|
|
|
commit->object.flags |= SHOWN;
|
|
|
|
show_edge(commit);
|
|
|
|
}
|
|
|
|
continue;
|
list-objects: mark more commits as edges in mark_edges_uninteresting
The purpose of edge commits is to let pack-objects know what objects
it can use as base, but does not need to include in the thin pack
because the other side is supposed to already have them. So far we
mark uninteresting parents of interesting commits as edges. But even
an unrelated uninteresting commit (that the other side has) may
become a good base for pack-objects and help produce more efficient
packs.
This is especially true for shallow clone, when the client issues a
fetch with a depth smaller or equal to the number of commits the
server is ahead of the client. For example, in this commit history
the client has up to "A" and the server has up to "B":
-------A---B
have--^ ^
/
want--+
If depth 1 is requested, the commit list to send to the client
includes only B. The way m_e_u is working, it checks if parent
commits of B are uninteresting, if so mark them as edges. Due to
shallow effect, commit B is grafted to have no parents and the
revision walker never sees A as the parent of B. In fact it marks no
edges at all in this simple case and sends everything B has to the
client even if it could have excluded what A and also the client
already have.
In a slightly different case where A is not a direct parent of B
(iow there are commits in between A and B), marking A as an edge can
still save some because B may still have stuff from the far ancestor
A.
There is another case from the earlier patch, when we deepen a ref
from C->E to A->E:
---A---B C---D---E
want--^ ^ ^
shallow-+ /
have-------+
In this case we need to send A and B to the client, and C (i.e. the
current shallow point that the client informs the server) is a very
good base because it's closet to A and B. Normal m_e_u won't recognize
C as an edge because it only looks back to parents (i.e. A<-B) not the
opposite way B->C even if C is already marked as uninteresting commit
by the previous patch.
This patch includes all uninteresting commits from command line as
edges and lets pack-objects decide what's best to do. The upside is we
have better chance of producing better packs in certain cases. The
downside is we may need to process some extra objects on the server
side.
For the shallow case on git.git, when the client is 5 commits behind
and does "fetch --depth=3", the result pack is 99.26 KiB instead of
4.92 MiB.
Reported-and-analyzed-by: Matthijs Kooijman <matthijs@stdin.nl>
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-08-16 17:52:07 +08:00
|
|
|
}
|
2019-01-17 02:25:58 +08:00
|
|
|
mark_edge_parents_uninteresting(commit, revs, show_edge);
|
2006-09-06 16:42:23 +08:00
|
|
|
}
|
|
|
|
}
|
2019-01-17 02:25:58 +08:00
|
|
|
|
2014-12-25 07:05:39 +08:00
|
|
|
if (revs->edge_hint_aggressive) {
|
list-objects: only look at cmdline trees with edge_hint
When rev-list is given a command-line like:
git rev-list --objects $commit --not --all
the most accurate answer is the difference between the set
of objects reachable from $commit and the set reachable from
all of the existing refs. However, we have not historically
provided that answer, because it is very expensive to
calculate. We would have to open every tree of every commit
in the entire history.
Instead, we find the accurate set difference of the
reachable commits, and then mark the trees at the boundaries
as uninteresting. This misses objects which appear in the
trees of both the interesting commits and deep within the
uninteresting history.
Commit fbd4a70 (list-objects: mark more commits as edges in
mark_edges_uninteresting, 2013-08-16) noticed that we miss
those objects during pack-objects, and added code to examine
the trees of all of the "--not" refs given on the
command-line. Note that this is still not the complete set
difference, because we look only at the tips of the
command-line arguments, not all of their reachable commits.
But it increases the set of boundary objects we consider,
which is especially important for shallow fetches. So we
are trading extra CPU time for a larger set of boundary
objects, which can improve the resulting pack size for a
--thin pack.
This tradeoff probably makes sense in the context of
pack-objects, where we have set revs->edge_hint to have the
traversal feed us the set of boundary objects. For a
regular rev-list, though, it is probably not a good
tradeoff. It is true that it makes our list slightly closer
to a true set difference, but it is a rare case where this
is important. And because we do not have revs->edge_hint
set, we do nothing useful with the larger set of boundary
objects.
This patch therefore ties the extra tree examination to the
revs->edge_hint flag; it is the presence of that flag that
makes the tradeoff worthwhile.
Here is output from the p0001-rev-list showing the
improvement in performance:
Test HEAD^ HEAD
-----------------------------------------------------------------------------------------
0001.1: rev-list --all 0.69(0.65+0.02) 0.69(0.66+0.02) +0.0%
0001.2: rev-list --all --objects 3.22(3.19+0.03) 3.23(3.20+0.03) +0.3%
0001.4: rev-list $commit --not --all 0.04(0.04+0.00) 0.04(0.04+0.00) +0.0%
0001.5: rev-list --objects $commit --not --all 0.27(0.26+0.01) 0.04(0.04+0.00) -85.2%
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-01-21 10:25:40 +08:00
|
|
|
for (i = 0; i < revs->cmdline.nr; i++) {
|
|
|
|
struct object *obj = revs->cmdline.rev[i].item;
|
|
|
|
struct commit *commit = (struct commit *)obj;
|
|
|
|
if (obj->type != OBJ_COMMIT || !(obj->flags & UNINTERESTING))
|
|
|
|
continue;
|
2018-09-21 23:57:39 +08:00
|
|
|
mark_tree_uninteresting(revs->repo,
|
2023-03-28 21:58:48 +08:00
|
|
|
repo_get_commit_tree(the_repository, commit));
|
list-objects: only look at cmdline trees with edge_hint
When rev-list is given a command-line like:
git rev-list --objects $commit --not --all
the most accurate answer is the difference between the set
of objects reachable from $commit and the set reachable from
all of the existing refs. However, we have not historically
provided that answer, because it is very expensive to
calculate. We would have to open every tree of every commit
in the entire history.
Instead, we find the accurate set difference of the
reachable commits, and then mark the trees at the boundaries
as uninteresting. This misses objects which appear in the
trees of both the interesting commits and deep within the
uninteresting history.
Commit fbd4a70 (list-objects: mark more commits as edges in
mark_edges_uninteresting, 2013-08-16) noticed that we miss
those objects during pack-objects, and added code to examine
the trees of all of the "--not" refs given on the
command-line. Note that this is still not the complete set
difference, because we look only at the tips of the
command-line arguments, not all of their reachable commits.
But it increases the set of boundary objects we consider,
which is especially important for shallow fetches. So we
are trading extra CPU time for a larger set of boundary
objects, which can improve the resulting pack size for a
--thin pack.
This tradeoff probably makes sense in the context of
pack-objects, where we have set revs->edge_hint to have the
traversal feed us the set of boundary objects. For a
regular rev-list, though, it is probably not a good
tradeoff. It is true that it makes our list slightly closer
to a true set difference, but it is a rare case where this
is important. And because we do not have revs->edge_hint
set, we do nothing useful with the larger set of boundary
objects.
This patch therefore ties the extra tree examination to the
revs->edge_hint flag; it is the presence of that flag that
makes the tradeoff worthwhile.
Here is output from the p0001-rev-list showing the
improvement in performance:
Test HEAD^ HEAD
-----------------------------------------------------------------------------------------
0001.1: rev-list --all 0.69(0.65+0.02) 0.69(0.66+0.02) +0.0%
0001.2: rev-list --all --objects 3.22(3.19+0.03) 3.23(3.20+0.03) +0.3%
0001.4: rev-list $commit --not --all 0.04(0.04+0.00) 0.04(0.04+0.00) +0.0%
0001.5: rev-list --objects $commit --not --all 0.27(0.26+0.01) 0.04(0.04+0.00) -85.2%
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-01-21 10:25:40 +08:00
|
|
|
if (!(obj->flags & SHOWN)) {
|
|
|
|
obj->flags |= SHOWN;
|
|
|
|
show_edge(commit);
|
|
|
|
}
|
list-objects: mark more commits as edges in mark_edges_uninteresting
The purpose of edge commits is to let pack-objects know what objects
it can use as base, but does not need to include in the thin pack
because the other side is supposed to already have them. So far we
mark uninteresting parents of interesting commits as edges. But even
an unrelated uninteresting commit (that the other side has) may
become a good base for pack-objects and help produce more efficient
packs.
This is especially true for shallow clone, when the client issues a
fetch with a depth smaller or equal to the number of commits the
server is ahead of the client. For example, in this commit history
the client has up to "A" and the server has up to "B":
-------A---B
have--^ ^
/
want--+
If depth 1 is requested, the commit list to send to the client
includes only B. The way m_e_u is working, it checks if parent
commits of B are uninteresting, if so mark them as edges. Due to
shallow effect, commit B is grafted to have no parents and the
revision walker never sees A as the parent of B. In fact it marks no
edges at all in this simple case and sends everything B has to the
client even if it could have excluded what A and also the client
already have.
In a slightly different case where A is not a direct parent of B
(iow there are commits in between A and B), marking A as an edge can
still save some because B may still have stuff from the far ancestor
A.
There is another case from the earlier patch, when we deepen a ref
from C->E to A->E:
---A---B C---D---E
want--^ ^ ^
shallow-+ /
have-------+
In this case we need to send A and B to the client, and C (i.e. the
current shallow point that the client informs the server) is a very
good base because it's closet to A and B. Normal m_e_u won't recognize
C as an edge because it only looks back to parents (i.e. A<-B) not the
opposite way B->C even if C is already marked as uninteresting commit
by the previous patch.
This patch includes all uninteresting commits from command line as
edges and lets pack-objects decide what's best to do. The upside is we
have better chance of producing better packs in certain cases. The
downside is we may need to process some extra objects on the server
side.
For the shallow case on git.git, when the client is 5 commits behind
and does "fetch --depth=3", the result pack is 99.26 KiB instead of
4.92 MiB.
Reported-and-analyzed-by: Matthijs Kooijman <matthijs@stdin.nl>
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2013-08-16 17:52:07 +08:00
|
|
|
}
|
|
|
|
}
|
2006-09-06 16:42:23 +08:00
|
|
|
}
|
|
|
|
|
process_{tree,blob}: show objects without buffering
Here's a less trivial thing, and slightly more dubious one.
I was looking at that "struct object_array objects", and wondering why we
do that. I have honestly totally forgotten. Why not just call the "show()"
function as we encounter the objects? Rather than add the objects to the
object_array, and then at the very end going through the array and doing a
'show' on all, just do things more incrementally.
Now, there are possible downsides to this:
- the "buffer using object_array" _can_ in theory result in at least
better I-cache usage (two tight loops rather than one more spread out
one). I don't think this is a real issue, but in theory..
- this _does_ change the order of the objects printed. Instead of doing a
"process_tree(revs, commit->tree, &objects, NULL, "");" in the loop
over the commits (which puts all the root trees _first_ in the object
list, this patch just adds them to the list of pending objects, and
then we'll traverse them in that order (and thus show each root tree
object together with the objects we discover under it)
I _think_ the new ordering actually makes more sense, but the object
ordering is actually a subtle thing when it comes to packing
efficiency, so any change in order is going to have implications for
packing. Good or bad, I dunno.
- There may be some reason why we did it that odd way with the object
array, that I have simply forgotten.
Anyway, now that we don't buffer up the objects before showing them
that may actually result in lower memory usage during that whole
traverse_commit_list() phase.
This is seriously not very deeply tested. It makes sense to me, it seems
to pass all the tests, it looks ok, but...
Does anybody remember why we did that "object_array" thing? It used to be
an "object_list" a long long time ago, but got changed into the array due
to better memory usage patterns (those linked lists of objects are
horrible from a memory allocation standpoint). But I wonder why we didn't
do this back then. Maybe there's a reason for it.
Or maybe there _used_ to be a reason, and no longer is.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-04-11 08:27:58 +08:00
|
|
|
static void add_pending_tree(struct rev_info *revs, struct tree *tree)
|
|
|
|
{
|
|
|
|
add_pending_object(revs, &tree->object, "");
|
|
|
|
}
|
|
|
|
|
2021-08-12 16:59:31 +08:00
|
|
|
static void traverse_non_commits(struct traversal_context *ctx,
|
|
|
|
struct strbuf *base)
|
2006-09-05 12:50:12 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2017-11-03 03:41:43 +08:00
|
|
|
assert(base->len == 0);
|
|
|
|
|
2018-08-14 02:14:28 +08:00
|
|
|
for (i = 0; i < ctx->revs->pending.nr; i++) {
|
|
|
|
struct object_array_entry *pending = ctx->revs->pending.objects + i;
|
2006-09-05 12:50:12 +08:00
|
|
|
struct object *obj = pending->item;
|
|
|
|
const char *name = pending->name;
|
traverse_commit_list: support pending blobs/trees with paths
When we call traverse_commit_list, we may have trees and
blobs in the pending array. As we process these, we pass the
"name" field from the pending entry as the path of the
object within the tree (which then becomes the root path if
we recurse into subtrees).
When we set up the traversal in prepare_revision_walk,
though, the "name" field of any pending trees and blobs is
likely to be the ref at which we found the object. We would
not want to make this part of the path (e.g., doing so would
make "git rev-list --objects v2.6.11-tree" in linux.git show
paths like "v2.6.11-tree/Makefile", which is nonsensical).
Therefore prepare_revision_walk sets the name field of each
pending tree and blobs to the empty string.
However, this leaves no room for a caller who does know the
correct path of a pending object to propagate that
information to the revision walker. We can fix this by
making two related changes:
1. Use the "path" field as the path instead of the "name"
field in traverse_commit_list. If the path is not set,
default to "" (which is what we always ended up with in
the current code, because of prepare_revision_walk).
2. In prepare_revision_walk, make a complete copy of the
entry. This makes the path field available to the
walker (if there is one), solving our problem.
Leaving the name field intact is now OK, as we do not
use it as a path due to point (1) above (and we can use
it to make more meaningful error messages if we want).
We also make the original "mode" field available to the
walker, though it does not actually use it.
Note that we still re-add the pending objects and free the
old ones (so we may strdup the path and name only to free
the old ones). This could be made more efficient by simply
copying the object_array entries that we are keeping.
However, that would require more restructuring of the code,
and is not done here.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-10-16 06:43:19 +08:00
|
|
|
const char *path = pending->path;
|
2006-09-05 12:50:12 +08:00
|
|
|
if (obj->flags & (UNINTERESTING | SEEN))
|
|
|
|
continue;
|
|
|
|
if (obj->type == OBJ_TAG) {
|
2021-04-09 19:28:02 +08:00
|
|
|
process_tag(ctx, (struct tag *)obj, name);
|
2006-09-05 12:50:12 +08:00
|
|
|
continue;
|
|
|
|
}
|
traverse_commit_list: support pending blobs/trees with paths
When we call traverse_commit_list, we may have trees and
blobs in the pending array. As we process these, we pass the
"name" field from the pending entry as the path of the
object within the tree (which then becomes the root path if
we recurse into subtrees).
When we set up the traversal in prepare_revision_walk,
though, the "name" field of any pending trees and blobs is
likely to be the ref at which we found the object. We would
not want to make this part of the path (e.g., doing so would
make "git rev-list --objects v2.6.11-tree" in linux.git show
paths like "v2.6.11-tree/Makefile", which is nonsensical).
Therefore prepare_revision_walk sets the name field of each
pending tree and blobs to the empty string.
However, this leaves no room for a caller who does know the
correct path of a pending object to propagate that
information to the revision walker. We can fix this by
making two related changes:
1. Use the "path" field as the path instead of the "name"
field in traverse_commit_list. If the path is not set,
default to "" (which is what we always ended up with in
the current code, because of prepare_revision_walk).
2. In prepare_revision_walk, make a complete copy of the
entry. This makes the path field available to the
walker (if there is one), solving our problem.
Leaving the name field intact is now OK, as we do not
use it as a path due to point (1) above (and we can use
it to make more meaningful error messages if we want).
We also make the original "mode" field available to the
walker, though it does not actually use it.
Note that we still re-add the pending objects and free the
old ones (so we may strdup the path and name only to free
the old ones). This could be made more efficient by simply
copying the object_array entries that we are keeping.
However, that would require more restructuring of the code,
and is not done here.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2014-10-16 06:43:19 +08:00
|
|
|
if (!path)
|
|
|
|
path = "";
|
2006-09-05 12:50:12 +08:00
|
|
|
if (obj->type == OBJ_TREE) {
|
list-objects: respect max_allowed_tree_depth
The tree traversal in list-objects.c, which is used by "rev-list
--objects", etc, uses recursion and may run out of stack space. Let's
teach it about the new core.maxTreeDepth config option.
We unfortunately can't return an error here, as this code doesn't
produce an error return at all. We'll die() instead, which matches the
behavior when we see an otherwise broken tree.
Note that this will also generally reject such deep trees from entering
the repository from a fetch or push, due to the use of rev-list in the
connectivity check. But it's not foolproof! We stop traversing when we
see an UNINTERESTING object, and the connectivity check marks existing
ref tips as UNINTERESTING. So imagine commit X has a tree
with maximum depth N. If you then create a new commit Y with a tree
entry "Y:subdir" that points to "X^{tree}", then the depth of Y will be
N+1. But a connectivity check running "git rev-list --objects Y --not X"
won't realize that; it will stop traversing at X^{tree}, since that was
already reachable.
So this will stop naive pushes of too-deep trees, but not carefully
crafted malicious ones. Doing it robustly and efficiently would require
caching the maximum depth of each tree (i.e., the longest path to any
leaf entry). That's much more complex and not strictly needed. If each
recursive algorithm limits itself already, then that's sufficient.
Blocking the objects from entering the repo would be a nice
belt-and-suspenders addition, but it's not worth the extra cost.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-08-31 14:22:03 +08:00
|
|
|
ctx->depth = 0;
|
2018-08-14 02:14:28 +08:00
|
|
|
process_tree(ctx, (struct tree *)obj, base, path);
|
2006-09-05 12:50:12 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (obj->type == OBJ_BLOB) {
|
2018-08-14 02:14:28 +08:00
|
|
|
process_blob(ctx, (struct blob *)obj, base, path);
|
2006-09-05 12:50:12 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
die("unknown pending object %s (%s)",
|
2015-11-10 10:22:28 +08:00
|
|
|
oid_to_hex(&obj->oid), name);
|
2006-09-05 12:50:12 +08:00
|
|
|
}
|
2018-08-14 02:14:28 +08:00
|
|
|
object_array_clear(&ctx->revs->pending);
|
2017-11-03 03:41:43 +08:00
|
|
|
}
|
|
|
|
|
2018-08-14 02:14:28 +08:00
|
|
|
static void do_traverse(struct traversal_context *ctx)
|
2017-11-03 03:41:43 +08:00
|
|
|
{
|
|
|
|
struct commit *commit;
|
|
|
|
struct strbuf csp; /* callee's scratch pad */
|
|
|
|
strbuf_init(&csp, PATH_MAX);
|
|
|
|
|
2018-08-14 02:14:28 +08:00
|
|
|
while ((commit = get_revision(ctx->revs)) != NULL) {
|
2021-04-12 21:37:35 +08:00
|
|
|
enum list_objects_filter_result r;
|
|
|
|
|
|
|
|
r = list_objects_filter__filter_object(ctx->revs->repo,
|
|
|
|
LOFS_COMMIT, &commit->object,
|
|
|
|
NULL, NULL, ctx->filter);
|
|
|
|
|
2017-11-03 03:41:43 +08:00
|
|
|
/*
|
|
|
|
* an uninteresting boundary commit may not have its tree
|
|
|
|
* parsed yet, but we are not going to show them anyway
|
|
|
|
*/
|
list-objects: don't queue root trees unless revs->tree_objects is set
When traverse_commit_list() processes each commit, it queues the
commit's root tree in the pending array. Then, after all commits are
processed, it calls traverse_trees_and_blobs() to walk over the pending
list, calling process_tree() on each. But if revs->tree_objects is not
set, process_tree() just exists immediately!
We can save ourselves some work by not even bothering to queue these
trees in the first place. There are a few subtle points to make:
- we also detect commits with a NULL tree pointer here. But this isn't
an interesting check for broken commits, since the lookup_tree()
we'd have done during commit parsing doesn't actually check that we
have the tree on disk. So we're not losing any robustness.
- besides queueing, we also set the NOT_USER_GIVEN flag on the tree
object. This is used by the traverse_commit_list_filtered() variant.
But if we're not exploring trees, then we won't actually care about
this flag, which is used only inside process_tree() code-paths.
- queueing trees eventually leads to us queueing blobs, too. But we
don't need to check revs->blob_objects here. Even in the current
code, we still wouldn't find those blobs, because we'd never open up
the tree objects to list their contents.
- the user-visible impact to the caller is minimal. The pending trees
are all cleared by the time the function returns anyway, by
traverse_trees_and_blobs(). We do call a show_commit() callback,
which technically could be looking at revs->pending during the
callback. But it seems like a rather unlikely thing to do (if you
want the tree of the current commit, then accessing the tree struct
member is a lot simpler).
So this should be safe to do. Let's look at the benefits:
[before]
Benchmark #1: git -C linux rev-list HEAD >/dev/null
Time (mean ± σ): 7.651 s ± 0.021 s [User: 7.399 s, System: 0.252 s]
Range (min … max): 7.607 s … 7.683 s 10 runs
[after]
Benchmark #1: git -C linux rev-list HEAD >/dev/null
Time (mean ± σ): 7.593 s ± 0.023 s [User: 7.329 s, System: 0.264 s]
Range (min … max): 7.565 s … 7.634 s 10 runs
Not too impressive, but then we're really just avoiding sticking a
pointer into a growable array. But still, I'll take a free 0.75%
speedup.
Let's try it after running "git commit-graph write":
[before]
Benchmark #1: git -C linux rev-list HEAD >/dev/null
Time (mean ± σ): 1.458 s ± 0.011 s [User: 1.199 s, System: 0.259 s]
Range (min … max): 1.447 s … 1.481 s 10 runs
[after]
Benchmark #1: git -C linux rev-list HEAD >/dev/null
Time (mean ± σ): 1.126 s ± 0.023 s [User: 896.5 ms, System: 229.0 ms]
Range (min … max): 1.106 s … 1.181 s 10 runs
Now that's more like it. We saved over 22% of the total time. Part of
that is because the runtime is shorter overall, but the absolute
improvement is also much larger. What's going on?
When we fill in a commit struct using the commit graph, we don't bother
to set the tree pointer, and instead lazy-load it when somebody calls
get_commit_tree(). So we're not only skipping the pointer write to the
pending queue, but we're skipping the lazy-load of the tree entirely.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-09-12 09:11:37 +08:00
|
|
|
if (!ctx->revs->tree_objects)
|
|
|
|
; /* do not bother loading tree */
|
2023-10-27 15:59:29 +08:00
|
|
|
else if (ctx->revs->do_not_die_on_missing_objects &&
|
|
|
|
oidset_contains(&ctx->revs->missing_commits, &commit->object.oid))
|
|
|
|
;
|
2023-03-28 21:58:48 +08:00
|
|
|
else if (repo_get_commit_tree(the_repository, commit)) {
|
|
|
|
struct tree *tree = repo_get_commit_tree(the_repository,
|
|
|
|
commit);
|
2018-10-06 05:31:24 +08:00
|
|
|
tree->object.flags |= NOT_USER_GIVEN;
|
|
|
|
add_pending_tree(ctx->revs, tree);
|
2019-04-10 10:13:25 +08:00
|
|
|
} else if (commit->object.parsed) {
|
|
|
|
die(_("unable to load root tree for commit %s"),
|
|
|
|
oid_to_hex(&commit->object.oid));
|
2018-10-06 05:31:24 +08:00
|
|
|
}
|
2021-04-12 21:37:35 +08:00
|
|
|
|
|
|
|
if (r & LOFR_MARK_SEEN)
|
|
|
|
commit->object.flags |= SEEN;
|
|
|
|
if (r & LOFR_DO_SHOW)
|
2022-03-10 00:01:38 +08:00
|
|
|
show_commit(ctx, commit);
|
2017-11-16 10:00:35 +08:00
|
|
|
|
2018-08-14 02:14:28 +08:00
|
|
|
if (ctx->revs->tree_blobs_in_commit_order)
|
2017-11-16 10:00:35 +08:00
|
|
|
/*
|
|
|
|
* NEEDSWORK: Adding the tree and then flushing it here
|
|
|
|
* needs a reallocation for each commit. Can we pass the
|
|
|
|
* tree directory without allocation churn?
|
|
|
|
*/
|
2021-08-12 16:59:31 +08:00
|
|
|
traverse_non_commits(ctx, &csp);
|
2017-11-03 03:41:43 +08:00
|
|
|
}
|
2021-08-12 16:59:31 +08:00
|
|
|
traverse_non_commits(ctx, &csp);
|
2017-11-03 03:41:43 +08:00
|
|
|
strbuf_release(&csp);
|
2006-09-05 12:50:12 +08:00
|
|
|
}
|
2017-11-22 04:58:50 +08:00
|
|
|
|
|
|
|
void traverse_commit_list_filtered(
|
|
|
|
struct rev_info *revs,
|
|
|
|
show_commit_fn show_commit,
|
|
|
|
show_object_fn show_object,
|
|
|
|
void *show_data,
|
|
|
|
struct oidset *omitted)
|
|
|
|
{
|
2022-03-10 00:01:36 +08:00
|
|
|
struct traversal_context ctx = {
|
|
|
|
.revs = revs,
|
|
|
|
.show_object = show_object,
|
|
|
|
.show_commit = show_commit,
|
|
|
|
.show_data = show_data,
|
|
|
|
};
|
|
|
|
|
|
|
|
if (revs->filter.choice)
|
|
|
|
ctx.filter = list_objects_filter__init(omitted, &revs->filter);
|
2018-08-14 02:14:28 +08:00
|
|
|
|
|
|
|
do_traverse(&ctx);
|
2022-03-10 00:01:36 +08:00
|
|
|
|
|
|
|
if (ctx.filter)
|
|
|
|
list_objects_filter__free(ctx.filter);
|
2017-11-22 04:58:50 +08:00
|
|
|
}
|