#ifndef OBJECT_H
#define OBJECT_H

#include "hash.h"

struct buffer_slab;
struct repository;

struct parsed_object_pool {
	struct object **obj_hash;
	int nr_objs, obj_hash_size;

	/* TODO: migrate alloc_states to mem-pool? */
	struct alloc_state *blob_state;
	struct alloc_state *tree_state;
	struct alloc_state *commit_state;
	struct alloc_state *tag_state;
	struct alloc_state *object_state;

	/* parent substitutions from .git/info/grafts and .git/shallow */
	struct commit_graft **grafts;
	int grafts_alloc, grafts_nr;

	int is_shallow;
	struct stat_validity *shallow_stat;
	char *alternate_shallow_file;

	int commit_graft_prepared;
	int substituted_parent;

	struct buffer_slab *buffer_slab;
};

struct parsed_object_pool *parsed_object_pool_new(void);
void parsed_object_pool_clear(struct parsed_object_pool *o);
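
/*
 * Illustrative lifecycle sketch; in practice the pool is owned by a
 * struct repository, so most code never calls these directly. Clearing
 * releases the pool's contents; freeing the struct itself is assumed
 * here to be the owner's job:
 *
 *	struct parsed_object_pool *pool = parsed_object_pool_new();
 *	... look up and parse objects ...
 *	parsed_object_pool_clear(pool);
 *	free(pool);
 */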

struct object_list {
	struct object *item;
	struct object_list *next;
};

struct object_array {
	unsigned int nr;
	unsigned int alloc;
	struct object_array_entry {
		struct object *item;
		/*
		 * name or NULL. If non-NULL, the memory pointed to
		 * is owned by this object *except* if it points at
		 * object_array_slopbuf, which is a static copy of the
		 * empty string.
		 */
		char *name;
		char *path;
		unsigned mode;
	} *objects;
};

#define OBJECT_ARRAY_INIT { 0 }

void object_array_init(struct object_array *array);
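
/*
 * Usage sketch: the two initialization styles below are equivalent.
 *
 *	struct object_array a = OBJECT_ARRAY_INIT;
 *
 *	struct object_array b;
 *	object_array_init(&b);
 */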

/*
 * object flag allocation:
 * revision.h:               0---------10         15             23------27
 * fetch-pack.c:             01    67
 * negotiator/default.c:        2--5
 * walker.c:                 0-2
 * upload-pack.c:                4       11-----14  16-----19
 * builtin/blame.c:                        12-13
 * bisect.c:                                        16
 * bundle.c:                                        16
 * http-push.c:                            11-----14
 * commit-graph.c:                                15
 * commit-reach.c:                                  16-----19
 * sha1-name.c:                                              20
 * list-objects-filter.c:                                      21
 * bloom.c:                                                    2122
 * builtin/fsck.c:           0--3
 * builtin/gc.c:             0
 * builtin/index-pack.c:                                     2021
 * reflog.c:                           10--12
 * builtin/show-branch.c:    0-------------------------------------------26
 * builtin/unpack-objects.c:                                  2021
 * pack-bitmap.h:                                              2122
 */
#define FLAG_BITS 28
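
/*
 * Sketch of how a file claims bits from the table above; MY_MARK is a
 * hypothetical flag name. Any flag must fit within FLAG_BITS bits:
 *
 *	#define MY_MARK (1u << 16)
 *
 *	obj->flags |= MY_MARK;
 *	if (obj->flags & MY_MARK)
 *		...
 */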

#define TYPE_BITS 3

/*
 * Values in this enum (except those outside the 3 bit range) are part
 * of pack file format. See gitformat-pack(5) for more information.
 */
enum object_type {
	OBJ_BAD = -1,
	OBJ_NONE = 0,
	OBJ_COMMIT = 1,
	OBJ_TREE = 2,
	OBJ_BLOB = 3,
	OBJ_TAG = 4,
	/* 5 for future expansion */
	OBJ_OFS_DELTA = 6,
	OBJ_REF_DELTA = 7,
	OBJ_ANY,
	OBJ_MAX
};

/* unknown mode (impossible combination S_IFIFO|S_IFCHR) */
#define S_IFINVALID     0030000

/*
 * A "directory link" is a link to another git directory.
 *
 * The value 0160000 is not normally a valid mode, and
 * also just happens to be S_IFDIR + S_IFLNK
 */
#define S_IFGITLINK	0160000
#define S_ISGITLINK(m)	(((m) & S_IFMT) == S_IFGITLINK)

#define S_ISSPARSEDIR(m) ((m) == S_IFDIR)

static inline enum object_type object_type(unsigned int mode)
{
	return S_ISDIR(mode) ? OBJ_TREE :
		S_ISGITLINK(mode) ? OBJ_COMMIT :
		OBJ_BLOB;
}
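
/*
 * Worked examples (sketch): object_type(0100644) yields OBJ_BLOB,
 * object_type(0040000) yields OBJ_TREE, and object_type(0160000)
 * (a gitlink) yields OBJ_COMMIT.
 */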

#define ce_permissions(mode) (((mode) & 0100) ? 0755 : 0644)
static inline unsigned int create_ce_mode(unsigned int mode)
{
	if (S_ISLNK(mode))
		return S_IFLNK;
	if (S_ISSPARSEDIR(mode))
		return S_IFDIR;
	if (S_ISDIR(mode) || S_ISGITLINK(mode))
		return S_IFGITLINK;
	return S_IFREG | ce_permissions(mode);
}

static inline unsigned int canon_mode(unsigned int mode)
{
	if (S_ISREG(mode))
		return S_IFREG | ce_permissions(mode);
	if (S_ISLNK(mode))
		return S_IFLNK;
	if (S_ISDIR(mode))
		return S_IFDIR;
	return S_IFGITLINK;
}
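
/*
 * Worked examples (sketch): both helpers normalize the permission
 * bits of regular files based only on the owner-execute bit, e.g.
 * create_ce_mode(0100664) and canon_mode(0100664) both yield 0100644,
 * while canon_mode(0100755) keeps the executable bit and yields
 * 0100755.
 */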

/*
 * The object type is stored in 3 bits.
 */
struct object {
	unsigned parsed : 1;
	unsigned type : TYPE_BITS;
	unsigned flags : FLAG_BITS;
	struct object_id oid;
};
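
/*
 * Note: 1 + TYPE_BITS + FLAG_BITS = 32, so the three bit-fields above
 * pack into a single 32-bit word ahead of the oid member.
 */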

const char *type_name(unsigned int type);
int type_from_string_gently(const char *str, ssize_t, int gentle);
#define type_from_string(str) type_from_string_gently(str, -1, 0)
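
/*
 * Round-trip sketch: type_from_string("commit") yields OBJ_COMMIT, and
 * type_name(OBJ_COMMIT) yields "commit". For an unknown type string,
 * type_from_string_gently() returns a negative value when "gentle" is
 * nonzero, instead of dying.
 */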

/*
 * Return the current number of buckets in the object hashmap.
 */
unsigned int get_max_object_index(void);

/*
 * Return the object from the specified bucket in the object hashmap.
 */
struct object *get_indexed_object(unsigned int);

/*
 * This can be used to see if we have heard of the object before, but
 * it can return "yes we have, and here is a half-initialised object"
 * for an object that we haven't loaded/parsed yet.
 *
 * When parsing a commit to create an in-core commit object, its
 * parents list holds commit objects that represent its parents, but
 * they are expected to be lazily initialized and do not know what
 * their trees or parents are yet. When this function returns such a
 * half-initialised object, the caller is expected to initialize it
 * by calling parse_object() on it.
 */
struct object *lookup_object(struct repository *r, const struct object_id *oid);
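
/*
 * Typical lookup-then-parse pattern (sketch):
 *
 *	struct object *o = lookup_object(r, oid);
 *	if (!o || !o->parsed)
 *		o = parse_object(r, oid);
 */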

void *create_object(struct repository *r, const struct object_id *oid, void *obj);

void *object_as_type(struct object *obj, enum object_type type, int quiet);

static inline const char *parse_mode(const char *str, uint16_t *modep)
{
	unsigned char c;
	unsigned int mode = 0;

	if (*str == ' ')
		return NULL;

	while ((c = *str++) != ' ') {
		if (c < '0' || c > '7')
			return NULL;
		mode = (mode << 3) + (c - '0');
	}
	*modep = mode;
	return str;
}
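
/*
 * Worked example (sketch): given str = "100644 COPYING", parse_mode()
 * stores 0100644 in *modep and returns a pointer to "COPYING"; given
 * an empty or non-octal mode field, it returns NULL.
 */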

/*
 * Returns the object, having parsed it to find out what it is.
 *
 * Returns NULL if the object is missing or corrupt.
 */
enum parse_object_flags {
	PARSE_OBJECT_SKIP_HASH_CHECK = 1 << 0,
	PARSE_OBJECT_DISCARD_TREE = 1 << 1,
};
struct object *parse_object(struct repository *r, const struct object_id *oid);
struct object *parse_object_with_flags(struct repository *r,
				       const struct object_id *oid,
				       enum parse_object_flags flags);
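
/*
 * Hypothetical caller sketch: code that trusts its input (e.g. because
 * the receiving side will verify the objects anyway) can skip the hash
 * check, and drop tree buffers it will never read:
 *
 *	struct object *o = parse_object_with_flags(r, oid,
 *						   PARSE_OBJECT_SKIP_HASH_CHECK |
 *						   PARSE_OBJECT_DISCARD_TREE);
 */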

/*
 * Like parse_object, but will die() instead of returning NULL. If the
 * "name" parameter is not NULL, it is included in the error message
 * (otherwise, the hex object ID is given).
 */
struct object *parse_object_or_die(const struct object_id *oid, const char *name);

/* Given the result of read_sha1_file(), returns the object after
 * parsing it. eaten_p indicates if the object has a borrowed copy
 * of buffer and the caller should not free() it.
 */
struct object *parse_object_buffer(struct repository *r, const struct object_id *oid, enum object_type type, unsigned long size, void *buffer, int *eaten_p);
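
/*
 * Sketch of the eaten_p contract (hypothetical caller that owns buf):
 *
 *	int eaten = 0;
 *	struct object *o = parse_object_buffer(r, oid, type, size, buf, &eaten);
 *	if (!eaten)
 *		free(buf);
 */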

/*
 * Allocate and return an object struct, even if you do not know the type of
 * the object. The returned object may have its "type" field set to a real type
 * (if somebody previously called lookup_blob(), etc), or it may be set to
 * OBJ_NONE. In the latter case, subsequent calls to lookup_blob(), etc, will
 * set the type field as appropriate.
 *
 * Use this when you do not know the expected type of an object and want to
 * avoid parsing it for efficiency reasons. Try to avoid it otherwise; it
 * may allocate excess memory, since the returned object must be as large as
 * the maximum struct of any type.
 */
struct object *lookup_unknown_object(struct repository *r, const struct object_id *oid);

/*
 * Dispatch to the appropriate lookup_blob(), lookup_commit(), etc, based on
 * "type".
 */
struct object *lookup_object_by_type(struct repository *r, const struct object_id *oid,
				     enum object_type type);
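
/*
 * Sketch: when the type is already known from elsewhere (e.g. a bitmap
 * or the commit-graph), this gets a struct without parsing; MY_MARK is
 * a hypothetical flag:
 *
 *	struct object *o = lookup_object_by_type(r, oid, OBJ_BLOB);
 *	o->flags |= MY_MARK;
 */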

enum peel_status {
	/* object was peeled successfully: */
	PEEL_PEELED = 0,

	/*
	 * object cannot be peeled because the named object (or an
	 * object referred to by a tag in the peel chain) does not
	 * exist.
	 */
	PEEL_INVALID = -1,

	/* object cannot be peeled because it is not a tag: */
	PEEL_NON_TAG = -2,

	/* ref_entry contains no peeled value because it is a symref: */
	PEEL_IS_SYMREF = -3,

	/*
	 * ref_entry cannot be peeled because it is broken (i.e., the
	 * symbolic reference cannot even be resolved to an object
	 * name):
	 */
	PEEL_BROKEN = -4
};

/*
 * Peel the named object; i.e., if the object is a tag, resolve the
 * tag recursively until a non-tag is found. If successful, store the
 * result to oid and return PEEL_PEELED. If the object is not a tag
 * or is not valid, return PEEL_NON_TAG or PEEL_INVALID, respectively,
 * and leave oid unchanged.
 */
enum peel_status peel_object(struct repository *r,
			     const struct object_id *name, struct object_id *oid);
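
/*
 * Illustrative call (sketch; "tag_oid" is a hypothetical input):
 *
 *	struct object_id peeled;
 *	if (peel_object(r, &tag_oid, &peeled) == PEEL_PEELED)
 *		... peeled now names the underlying non-tag object ...
 */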

struct object_list *object_list_insert(struct object *item,
				       struct object_list **list_p);

int object_list_contains(struct object_list *list, struct object *obj);

void object_list_free(struct object_list **list);

/* Object array handling .. */
void add_object_array(struct object *obj, const char *name, struct object_array *array);
void add_object_array_with_path(struct object *obj, const char *name, struct object_array *array, unsigned mode, const char *path);
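
/*
 * Note: the array makes its own copy of "name" (see the name field of
 * struct object_array_entry above), so passing a transient buffer such
 * as the return value of oid_to_hex() is safe:
 *
 *	add_object_array(obj, oid_to_hex(&obj->oid), &array);
 */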

/*
 * Returns NULL if the array is empty. Otherwise, returns the last object
 * after removing its entry from the array. Other resources associated
 * with that object are left in an unspecified state and should not be
 * examined.
 */
struct object *object_array_pop(struct object_array *array);
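
/*
 * Typical draining loop (sketch):
 *
 *	struct object *o;
 *	while ((o = object_array_pop(&array)) != NULL) {
 *		... use o ...
 *	}
 */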

typedef int (*object_array_each_func_t)(struct object_array_entry *, void *);

/*
 * Apply want to each entry in array, retaining only the entries for
 * which the function returns true. Preserve the order of the entries
 * that are retained.
 */
void object_array_filter(struct object_array *array,
			 object_array_each_func_t want, void *cb_data);
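
/*
 * Sketch of a filter callback (hypothetical): keep only commits.
 *
 *	static int want_commits(struct object_array_entry *entry, void *cb_data)
 *	{
 *		return entry->item->type == OBJ_COMMIT;
 *	}
 *
 *	object_array_filter(&array, want_commits, NULL);
 */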

/*
 * Remove from array all but the first entry with a given name.
 * Warning: this function uses an O(N^2) algorithm.
 */
void object_array_remove_duplicates(struct object_array *array);

/*
 * Remove any objects from the array, freeing all used memory; afterwards
 * the array is ready to store more objects with add_object_array().
 */
void object_array_clear(struct object_array *array);

void clear_object_flags(unsigned flags);

/*
 * Clear the specified object flags from all in-core commit objects in
 * the specified repository.
 */
void repo_clear_commit_marks(struct repository *r, unsigned int flags);

#endif /* OBJECT_H */