git/t/t1051-large-conversion.sh

#!/bin/sh

# One-line summary of what this script exercises.
test_description='test conversion filters on large files'

# Marks this script as expected to pass when the suite runs with SANITIZE=leak.
TEST_PASSES_SANITIZE_LEAK=true
# Load the test harness. The helpers used below that are not defined in this
# file (test_expect_success, test_config, test_when_finished, test_cmp,
# test_commit, test_file_size, test_done) presumably come from here.
. ./test-lib.sh

# set_attr <attribute>...
#
# Replace .gitattributes with a single line that applies the given
# attribute(s) to every path ("*"). The removal of .gitattributes is
# registered with test_when_finished before the file is written; if that
# registration fails, nothing is written and the failure is returned.
set_attr() {
	test_when_finished "rm -f .gitattributes" &&
	printf '%s\n' "* $*" >.gitattributes
}

# check_input
#
# Empty the index, re-add the worktree files "small" and "large", and verify
# that the first line of the blob stored for "large" is identical to the
# whole blob stored for "small".
#
# Files written in the current directory: small.index, large.blob,
# large.index.
# Returns: 0 when every step succeeds and the contents match; otherwise the
# non-zero status of the first failing step.
check_input() {
	git read-tree --empty &&
	git add small large &&
	git cat-file blob :small >small.index &&
	# Go through an intermediate file rather than piping git into head:
	# a pipeline reports only head's exit status, so a failing or
	# crashing "git cat-file" would otherwise go unnoticed.
	git cat-file blob :large >large.blob &&
	head -n 1 large.blob >large.index &&
	test_cmp small.index large.index
}

# check_output
#
# Delete the worktree copies of "small" and "large", check both out again,
# and verify that the first line of the checked-out "large" is identical to
# the whole checked-out "small".
#
# Files written in the current directory: large.head (plus whatever the
# checkout recreates).
# Returns: 0 when every step succeeds and the contents match; otherwise the
# non-zero status of the first failing step.
check_output() {
	rm -f small large &&
	git checkout small large &&
	head -n 1 <large >large.head &&
	test_cmp small large.head
}

# Fixture for the "converts on input" tests.
#
# "small" is a single line, "$Id: foo$" terminated by CR LF (11 bytes);
# "large" is two copies of it (22 bytes). core.bigfilethreshold is set to 20,
# which lies between the two sizes, so only "large" exceeds it.
# Also defines the clean command of a filter named "test", which rewrites
# every line it is given to "CLEAN".
test_expect_success 'setup input tests' '
	printf "\$Id: foo\$\\r\\n" >small &&
	cat small small >large &&
	git config core.bigfilethreshold 20 &&
	git config filter.test.clean "sed s/.*/CLEAN/"
'

# Each test below switches on one kind of conversion and then runs
# check_input, which re-adds both files and requires the first line of the
# blob stored for "large" to equal the blob stored for "small".

# Conversion selected through configuration (core.autocrlf) rather than
# attributes. Presumably test_config undoes the setting when the test ends --
# TODO confirm against test-lib.
test_expect_success 'autocrlf=true converts on input' '
	test_config core.autocrlf true &&
	check_input
'

# Conversion selected through the "eol" attribute, applied to all paths by
# set_attr.
test_expect_success 'eol=crlf converts on input' '
	set_attr eol=crlf &&
	check_input
'

# Conversion selected through the "ident" attribute.
test_expect_success 'ident converts on input' '
	set_attr ident &&
	check_input
'

# Conversion through the user-defined "test" filter, whose clean command was
# configured in the setup test.
test_expect_success 'user-defined filters convert on input' '
	set_attr filter=test &&
	check_input
'

# Fixture for the "converts on output" tests.
#
# "small" is the single line "$Id$" (5 bytes including the newline written by
# echo); "large" is two copies of it (10 bytes). Both are staged, and
# core.bigfilethreshold is lowered to 7, which lies between the two sizes.
# Also defines the smudge command of the "test" filter, which rewrites every
# line it is given to "SMUDGE".
test_expect_success 'setup output tests' '
	echo "\$Id\$" >small &&
	cat small small >large &&
	git add small large &&
	git config core.bigfilethreshold 7 &&
	git config filter.test.smudge "sed s/.*/SMUDGE/"
'

# Each test below switches on one kind of conversion and then runs
# check_output, which removes and re-checks-out both files and requires the
# first line of the checked-out "large" to equal the checked-out "small".

# Conversion selected through configuration (core.autocrlf).
test_expect_success 'autocrlf=true converts on output' '
	test_config core.autocrlf true &&
	check_output
'

# Conversion selected through the "eol" attribute.
test_expect_success 'eol=crlf converts on output' '
	set_attr eol=crlf &&
	check_output
'

# Conversion through the user-defined "test" filter, whose smudge command was
# configured in the output setup test.
test_expect_success 'user-defined filters convert on output' '
	set_attr filter=test &&
	check_output
'

# Same idea as check_output, but done by hand: before comparing, everything
# from "Id: " to the end of the line is replaced with "Id: SHA" in both
# "small" and the first line of "large". Presumably this is needed because
# the expanded keyword differs between the two files -- TODO confirm.
#
# NOTE(review): "sed -n ... p" prints only lines on which the substitution
# happened. If neither file contains "Id: " after checkout, both *.clean
# files are empty and test_cmp compares two empty files, so the test would
# still pass.
test_expect_success 'ident converts on output' '
	set_attr ident &&
	rm -f small large &&
	git checkout small large &&
	sed -n "s/Id: .*/Id: SHA/p" <small >small.clean &&
	head -n 1 large >large.head &&
	sed -n "s/Id: .*/Id: SHA/p" <large.head >large.clean &&
	test_cmp small.clean large.clean
'

# This smudge filter prepends 5GB of zeros to the file it checks out. This
# ensures that smudging doesn't mangle large files on 64-bit Windows.
#
# Steps: commit "small", record its size, configure the "makelarge" smudge
# command (test-tool genzeros emitting 5*1024*1024*1024 bytes, followed by
# cat passing the original content through), attach that filter to "small"
# via .gitattributes, then delete and re-check-out "small". The checked-out
# size must be exactly 5*1024*1024*1024 plus the recorded size.
#
# Runs only when the listed prerequisites hold (EXPENSIVE, SIZE_T_IS_64BIT
# and not LONG_IS_64BIT).
# NOTE(review): unlike set_attr, .gitattributes is written here without a
# test_when_finished cleanup, so it remains in place after the test.
test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \
		'files over 4GB convert on output' '
	test_commit test small "a small file" &&
	small_size=$(test_file_size small) &&
	test_config filter.makelarge.smudge \
		"test-tool genzeros $((5*1024*1024*1024)) && cat" &&
	echo "small filter=makelarge" >.gitattributes &&
	rm small &&
	git checkout -- small &&
	size=$(test_file_size small) &&
	test "$size" -eq $((5 * 1024 * 1024 * 1024 + $small_size))
'

# This clean filter writes down the size of input it receives. By checking against
# the actual size, we ensure that cleaning doesn't mangle large files on 64-bit Windows.
#
# Steps: create "big" holding 5*1024*1024*1024 zero bytes, configure the
# "checklarge" clean command (wc -c, with its output redirected to big.size,
# so the command writes nothing to its own stdout), attach that filter to
# "big" via .gitattributes, and stage "big". The byte count recorded in
# big.size must equal the size of "big" on disk.
#
# Runs only when the listed prerequisites hold (EXPENSIVE, SIZE_T_IS_64BIT
# and not LONG_IS_64BIT).
test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \
		'files over 4GB convert on input' '
	test-tool genzeros $((5*1024*1024*1024)) >big &&
	test_config filter.checklarge.clean "wc -c >big.size" &&
	echo "big filter=checklarge" >.gitattributes &&
	git add big &&
	test $(test_file_size big) -eq $(cat big.size)
'

# Final call into the test harness; no tests follow it.
test_done
do not stream large files to pack when filters are in use Because git's object format requires us to specify the number of bytes in the object in its header, we must know the size before streaming a blob into the object database. This is not a problem when adding a regular file, as we can get the size from stat(). However, when filters are in use (such as autocrlf, or the ident, filter, or eol gitattributes), we have no idea what the ultimate size will be. The current code just punts on the whole issue and ignores filter configuration entirely for files larger than core.bigfilethreshold. This can generate confusing results if you use filters for large binary files, as the filter will suddenly stop working as the file goes over a certain size. Rather than try to handle unknown input sizes with streaming, this patch just turns off the streaming optimization when filters are in use. This has a slight performance regression in a very specific case: if you have autocrlf on, but no gitattributes, a large binary file will avoid the streaming code path because we don't know beforehand whether it will need conversion or not. But if you are handling large binary files, you should be marking them as such via attributes (or at least not using autocrlf, and instead marking your text files as such). And the flip side is that if you have a large _non_-binary file, there is a correctness improvement; before we did not apply the conversion at all. The first half of the new t1051 script covers these failures on input. The second half tests the matching output code paths. These already work correctly, and do not need any adjustment. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com> 2012-02-25 06:10:17 +08:00			`#!/bin/sh`

			`test_description='test conversion filters on large files'`
cat-file: fix a common "struct object_context" memory leak Fix a memory leak where "cat-file" will leak the "path" member. See e5fba602e59 (textconv: support for cat_file, 2010-06-15) for the code that introduced the offending get_oid_with_context() call (called get_sha1_with_context() at the time). As a result we can mark several tests as passing with SANITIZE=leak using "TEST_PASSES_SANITIZE_LEAK=true". As noted in dc944b65f1d (get_sha1_with_context: dynamically allocate oc->path, 2017-05-19) callers must free the "path" member. That same commit added the relevant free() to this function, but we weren't catching cases where we'd return early. Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> 2022-07-01 18:42:59 +08:00
			`TEST_PASSES_SANITIZE_LEAK=true`
do not stream large files to pack when filters are in use Because git's object format requires us to specify the number of bytes in the object in its header, we must know the size before streaming a blob into the object database. This is not a problem when adding a regular file, as we can get the size from stat(). However, when filters are in use (such as autocrlf, or the ident, filter, or eol gitattributes), we have no idea what the ultimate size will be. The current code just punts on the whole issue and ignores filter configuration entirely for files larger than core.bigfilethreshold. This can generate confusing results if you use filters for large binary files, as the filter will suddenly stop working as the file goes over a certain size. Rather than try to handle unknown input sizes with streaming, this patch just turns off the streaming optimization when filters are in use. This has a slight performance regression in a very specific case: if you have autocrlf on, but no gitattributes, a large binary file will avoid the streaming code path because we don't know beforehand whether it will need conversion or not. But if you are handling large binary files, you should be marking them as such via attributes (or at least not using autocrlf, and instead marking your text files as such). And the flip side is that if you have a large _non_-binary file, there is a correctness improvement; before we did not apply the conversion at all. The first half of the new t1051 script covers these failures on input. The second half tests the matching output code paths. These already work correctly, and do not need any adjustment. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com> 2012-02-25 06:10:17 +08:00			`. ./test-lib.sh`

			`set_attr() {`
			`test_when_finished 'rm -f .gitattributes' &&`
			`echo "* $*" >.gitattributes`
			`}`

			`check_input() {`
			`git read-tree --empty &&`
			`git add small large &&`
			`git cat-file blob :small >small.index &&`
			`git cat-file blob :large \| head -n 1 >large.index &&`
			`test_cmp small.index large.index`
			`}`

			`check_output() {`
			`rm -f small large &&`
			`git checkout small large &&`
			`head -n 1 large >large.head &&`
			`test_cmp small large.head`
			`}`

			`test_expect_success 'setup input tests' '`
			`printf "\$Id: foo\$\\r\\n" >small &&`
			`cat small small >large &&`
			`git config core.bigfilethreshold 20 &&`
			`git config filter.test.clean "sed s/.*/CLEAN/"`
			`'`

			`test_expect_success 'autocrlf=true converts on input' '`
			`test_config core.autocrlf true &&`
			`check_input`
			`'`

			`test_expect_success 'eol=crlf converts on input' '`
			`set_attr eol=crlf &&`
			`check_input`
			`'`

			`test_expect_success 'ident converts on input' '`
			`set_attr ident &&`
			`check_input`
			`'`

			`test_expect_success 'user-defined filters convert on input' '`
			`set_attr filter=test &&`
			`check_input`
			`'`

			`test_expect_success 'setup output tests' '`
			`echo "\$Id\$" >small &&`
			`cat small small >large &&`
			`git add small large &&`
			`git config core.bigfilethreshold 7 &&`
			`git config filter.test.smudge "sed s/.*/SMUDGE/"`
			`'`

			`test_expect_success 'autocrlf=true converts on output' '`
			`test_config core.autocrlf true &&`
			`check_output`
			`'`

			`test_expect_success 'eol=crlf converts on output' '`
			`set_attr eol=crlf &&`
			`check_output`
			`'`

			`test_expect_success 'user-defined filters convert on output' '`
			`set_attr filter=test &&`
			`check_output`
			`'`

			`test_expect_success 'ident converts on output' '`
			`set_attr ident &&`
			`rm -f small large &&`
			`git checkout small large &&`
			`sed -n "s/Id: .*/Id: SHA/p" <small >small.clean &&`
			`head -n 1 large >large.head &&`
			`sed -n "s/Id: .*/Id: SHA/p" <large.head >large.clean &&`
			`test_cmp small.clean large.clean`
			`'`

t1051: introduce a smudge filter test for extremely large files The filter system allows for alterations to file contents when they're added to the database or working tree. ("Smudge" when moving to the working tree; "clean" when moving to the database.) This is used natively to handle CRLF to LF conversions. It's also employed by Git-LFS to replace large files from the working tree with small tracking files in the repo and vice versa. Git reads the entire smudged file into memory to convert it into a "clean" form to be used in-core. While this is inefficient, there's a more insidious problem on some platforms due to inconsistency between using unsigned long and size_t for the same type of data (size of a file in bytes). On most 64-bit platforms, unsigned long is 64 bits, and size_t is typedef'd to unsigned long. On Windows, however, unsigned long is only 32 bits (and therefore on 64-bit Windows, size_t is typedef'd to unsigned long long in order to be 64 bits). Practically speaking, this means 64-bit Windows users of Git-LFS can't handle files larger than 2^32 bytes. Other 64-bit platforms don't suffer this limitation. This commit introduces a test exposing the issue; future commits make it pass. The test simulates the way Git-LFS works by having a tiny file checked into the repository and expanding it to a huge file on checkout. Helped-by: Johannes Schindelin <johannes.schindelin@gmx.de> Signed-off-by: Matt Cooper <vtbassmatt@gmail.com> Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> Signed-off-by: Junio C Hamano <gitster@pobox.com> 2021-11-02 23:46:07 +08:00			`# This smudge filter prepends 5GB of zeros to the file it checks out. This`
			`# ensures that smudging doesn't mangle large files on 64-bit Windows.`
odb: teach read_blob_entry to use size_t There is mixed use of size_t and unsigned long to deal with sizes in the codebase. Recall that Windows defines unsigned long as 32 bits even on 64-bit platforms, meaning that converting size_t to unsigned long narrows the range. This mostly doesn't cause a problem since Git rarely deals with files larger than 2^32 bytes. But adjunct systems such as Git LFS, which use smudge/clean filters to keep huge files out of the repository, may have huge file contents passed through some of the functions in entry.c and convert.c. On Windows, this results in a truncated file being written to the workdir. I traced this to one specific use of unsigned long in write_entry (and a similar instance in write_pc_item_to_fd for parallel checkout). That appeared to be for the call to read_blob_entry, which expects a pointer to unsigned long. By altering the signature of read_blob_entry to expect a size_t, write_entry can be switched to use size_t internally (which all of its callers and most of its callees already used). To avoid touching dozens of additional files, read_blob_entry uses a local unsigned long to call a chain of functions which aren't prepared to accept size_t. Helped-by: Johannes Schindelin <johannes.schindelin@gmx.de> Signed-off-by: Matt Cooper <vtbassmatt@gmail.com> Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> Signed-off-by: Junio C Hamano <gitster@pobox.com> 2021-11-02 23:46:08 +08:00			`test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \`
t1051: introduce a smudge filter test for extremely large files The filter system allows for alterations to file contents when they're added to the database or working tree. ("Smudge" when moving to the working tree; "clean" when moving to the database.) This is used natively to handle CRLF to LF conversions. It's also employed by Git-LFS to replace large files from the working tree with small tracking files in the repo and vice versa. Git reads the entire smudged file into memory to convert it into a "clean" form to be used in-core. While this is inefficient, there's a more insidious problem on some platforms due to inconsistency between using unsigned long and size_t for the same type of data (size of a file in bytes). On most 64-bit platforms, unsigned long is 64 bits, and size_t is typedef'd to unsigned long. On Windows, however, unsigned long is only 32 bits (and therefore on 64-bit Windows, size_t is typedef'd to unsigned long long in order to be 64 bits). Practically speaking, this means 64-bit Windows users of Git-LFS can't handle files larger than 2^32 bytes. Other 64-bit platforms don't suffer this limitation. This commit introduces a test exposing the issue; future commits make it pass. The test simulates the way Git-LFS works by having a tiny file checked into the repository and expanding it to a huge file on checkout. Helped-by: Johannes Schindelin <johannes.schindelin@gmx.de> Signed-off-by: Matt Cooper <vtbassmatt@gmail.com> Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> Signed-off-by: Junio C Hamano <gitster@pobox.com> 2021-11-02 23:46:07 +08:00			`'files over 4GB convert on output' '`
			`test_commit test small "a small file" &&`
			`small_size=$(test_file_size small) &&`
			`test_config filter.makelarge.smudge \`
			`"test-tool genzeros $((5*1024*1024*1024)) && cat" &&`
			`echo "small filter=makelarge" >.gitattributes &&`
			`rm small &&`
			`git checkout -- small &&`
			`size=$(test_file_size small) &&`
			`test "$size" -eq $((5 * 1024 * 1024 * 1024 + $small_size))`
			`'`

clean/smudge: allow clean filters to process extremely large files The filter system allows for alterations to file contents when they're moved between the database and the worktree. We already made sure that it is possible for smudge filters to produce contents that are larger than `unsigned long` can represent (which matters on systems where `unsigned long` is narrower than `size_t`, most notably 64-bit Windows). Now we make sure that clean filters can _consume_ contents that are larger than that. Note that this commit only allows clean filters' _input_ to be larger than can be represented by `unsigned long`. This change makes only a very minute dent into the much larger project to teach Git to use `size_t` instead of `unsigned long` wherever appropriate. Helped-by: Johannes Schindelin <johannes.schindelin@gmx.de> Signed-off-by: Matt Cooper <vtbassmatt@gmail.com> Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de> Signed-off-by: Junio C Hamano <gitster@pobox.com> 2021-11-02 23:46:11 +08:00			`# This clean filter writes down the size of input it receives. By checking against`
			`# the actual size, we ensure that cleaning doesn't mangle large files on 64-bit Windows.`
			`test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \`
			`'files over 4GB convert on input' '`
			`test-tool genzeros $((5*1024*1024*1024)) >big &&`
			`test_config filter.checklarge.clean "wc -c >big.size" &&`
			`echo "big filter=checklarge" >.gitattributes &&`
			`git add big &&`
			`test $(test_file_size big) -eq $(cat big.size)`
			`'`

do not stream large files to pack when filters are in use Because git's object format requires us to specify the number of bytes in the object in its header, we must know the size before streaming a blob into the object database. This is not a problem when adding a regular file, as we can get the size from stat(). However, when filters are in use (such as autocrlf, or the ident, filter, or eol gitattributes), we have no idea what the ultimate size will be. The current code just punts on the whole issue and ignores filter configuration entirely for files larger than core.bigfilethreshold. This can generate confusing results if you use filters for large binary files, as the filter will suddenly stop working as the file goes over a certain size. Rather than try to handle unknown input sizes with streaming, this patch just turns off the streaming optimization when filters are in use. This has a slight performance regression in a very specific case: if you have autocrlf on, but no gitattributes, a large binary file will avoid the streaming code path because we don't know beforehand whether it will need conversion or not. But if you are handling large binary files, you should be marking them as such via attributes (or at least not using autocrlf, and instead marking your text files as such). And the flip side is that if you have a large _non_-binary file, there is a correctness improvement; before we did not apply the conversion at all. The first half of the new t1051 script covers these failures on input. The second half tests the matching output code paths. These already work correctly, and do not need any adjustment. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com> 2012-02-25 06:10:17 +08:00			`test_done`