aboutsummaryrefslogtreecommitdiffstats
path: root/t
diff options
context:
space:
mode:
authorDerrick Stolee <derrickstolee@github.com>2025-02-03 17:11:06 +0000
committerJunio C Hamano <gitster@pobox.com>2025-02-03 16:12:42 -0800
commitbff455576750bd013a3c87b15cc7086cb8c1eab0 (patch)
tree28c1c20f55e9da7b10e59b83d49e1ddcac775a08 /t
parent6840fe9ee29ab51ffd7d924c624dc62da22c50bf (diff)
downloadgit-bff455576750bd013a3c87b15cc7086cb8c1eab0.tar.gz
backfill: add --sparse option
One way to significantly reduce the cost of a Git clone and later fetches is to use a blobless partial clone and combine that with a sparse-checkout that reduces the paths that need to be populated in the working directory. Not only does this reduce the cost of clones and fetches, the sparse-checkout reduces the number of objects needed to download from a promisor remote. However, history investigations can be expensive as computing blob diffs will trigger promisor remote requests for one object at a time. This can be avoided by downloading the blobs needed for the given sparse-checkout using 'git backfill' and its new '--sparse' mode, at a time that the user is willing to pay that extra cost. Note that this is distinctly different from the '--filter=sparse:<oid>' option, as this assumes that the partial clone has all reachable trees and we are using client-side logic to avoid downloading blobs outside of the sparse-checkout cone. This avoids the server-side cost of walking trees while also achieving a similar goal. It also downloads in batches based on similar path names, presenting a resumable download if things are interrupted. This augments the path-walk API to have a possibly-NULL 'pl' member that may point to a 'struct pattern_list'. This could be more general than the sparse-checkout definition at HEAD, but 'git backfill --sparse' is currently the only consumer. Be sure to test this in both cone mode and not cone mode. Cone mode has the benefit that the path-walk can skip certain paths once they would expand beyond the sparse-checkout. Non-cone mode can describe the included files using both positive and negative patterns, which changes the possible return values of path_matches_pattern_list(). Test both kinds of matches for increased coverage. To test this, we can create a blobless sparse clone, expand the sparse-checkout slightly, and then run 'git backfill --sparse' to see how much data is downloaded. The general steps are 1. git clone --filter=blob:none --sparse <url> 2. git sparse-checkout set <dir1> ... <dirN> 3. git backfill --sparse For the Git repository with the 'builtin' directory in the sparse-checkout, we get these results for various batch sizes: | Batch Size | Pack Count | Pack Size | Time | |-----------------|------------|-----------|-------| | (Initial clone) | 3 | 110 MB | | | 10K | 12 | 192 MB | 17.2s | | 15K | 9 | 192 MB | 15.5s | | 20K | 8 | 192 MB | 15.5s | | 25K | 7 | 192 MB | 14.7s | This case matters less because a full clone of the Git repository from GitHub is currently at 277 MB. Using a copy of the Linux repository with the 'kernel/' directory in the sparse-checkout, we get these results: | Batch Size | Pack Count | Pack Size | Time | |-----------------|------------|-----------|------| | (Initial clone) | 2 | 1,876 MB | | | 10K | 11 | 2,187 MB | 46s | | 25K | 7 | 2,188 MB | 43s | | 50K | 5 | 2,194 MB | 44s | | 100K | 4 | 2,194 MB | 48s | This case is more meaningful because a full clone of the Linux repository is currently over 6 GB, so this is a valuable way to download a fraction of the repository and no longer need network access for all reachable objects within the sparse-checkout. Choosing a batch size will depend on a lot of factors, including the user's network speed or reliability, the repository's file structure, and how many versions there are of the file within the sparse-checkout scope. There will not be a one-size-fits-all solution. Signed-off-by: Derrick Stolee <stolee@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 't')
-rw-r--r--t/helper/test-path-walk.c22
-rwxr-xr-xt/t5620-backfill.sh88
-rwxr-xr-xt/t6601-path-walk.sh32
3 files changed, 141 insertions, 1 deletions
diff --git a/t/helper/test-path-walk.c b/t/helper/test-path-walk.c
index 7f2d409c5b..61e845e5ec 100644
--- a/t/helper/test-path-walk.c
+++ b/t/helper/test-path-walk.c
@@ -1,6 +1,7 @@
#define USE_THE_REPOSITORY_VARIABLE
#include "test-tool.h"
+#include "dir.h"
#include "environment.h"
#include "hex.h"
#include "object-name.h"
@@ -9,6 +10,7 @@
#include "revision.h"
#include "setup.h"
#include "parse-options.h"
+#include "strbuf.h"
#include "path-walk.h"
#include "oid-array.h"
@@ -65,7 +67,7 @@ static int emit_block(const char *path, struct oid_array *oids,
int cmd__path_walk(int argc, const char **argv)
{
- int res;
+ int res, stdin_pl = 0;
struct rev_info revs = REV_INFO_INIT;
struct path_walk_info info = PATH_WALK_INFO_INIT;
struct path_walk_test_data data = { 0 };
@@ -80,6 +82,8 @@ int cmd__path_walk(int argc, const char **argv)
N_("toggle inclusion of tree objects")),
OPT_BOOL(0, "prune", &info.prune_all_uninteresting,
N_("toggle pruning of uninteresting paths")),
+ OPT_BOOL(0, "stdin-pl", &stdin_pl,
+ N_("read a pattern list over stdin")),
OPT_END(),
};
@@ -99,6 +103,17 @@ int cmd__path_walk(int argc, const char **argv)
info.path_fn = emit_block;
info.path_fn_data = &data;
+ if (stdin_pl) {
+ struct strbuf in = STRBUF_INIT;
+ CALLOC_ARRAY(info.pl, 1);
+
+ info.pl->use_cone_patterns = 1;
+
+ strbuf_fread(&in, 2048, stdin);
+ add_patterns_from_buffer(in.buf, in.len, "", 0, info.pl);
+ strbuf_release(&in);
+ }
+
res = walk_objects_by_path(&info);
printf("commits:%" PRIuMAX "\n"
@@ -107,6 +122,11 @@ int cmd__path_walk(int argc, const char **argv)
"tags:%" PRIuMAX "\n",
data.commit_nr, data.tree_nr, data.blob_nr, data.tag_nr);
+ if (info.pl) {
+ clear_pattern_list(info.pl);
+ free(info.pl);
+ }
+
release_revisions(&revs);
return res;
}
diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh
index 36107a51c5..6b72e9d0e3 100755
--- a/t/t5620-backfill.sh
+++ b/t/t5620-backfill.sh
@@ -77,6 +77,94 @@ test_expect_success 'do partial clone 2, backfill min batch size' '
test_line_count = 0 revs2
'
+test_expect_success 'backfill --sparse' '
+ git clone --sparse --filter=blob:none \
+ --single-branch --branch=main \
+ "file://$(pwd)/srv.bare" backfill3 &&
+
+ # Initial checkout includes four files at root.
+ git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
+ test_line_count = 44 missing &&
+
+ # Initial sparse-checkout is just the files at root, so we get the
+ # older versions of the four files at tip.
+ GIT_TRACE2_EVENT="$(pwd)/sparse-trace1" git \
+ -C backfill3 backfill --sparse &&
+ test_trace2_data promisor fetch_count 4 <sparse-trace1 &&
+ test_trace2_data path-walk paths 5 <sparse-trace1 &&
+ git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
+ test_line_count = 40 missing &&
+
+ # Expand the sparse-checkout to include 'd' recursively. This
+ # engages the algorithm to skip the trees for 'a'. Note that
+ # the "sparse-checkout set" command downloads the objects at tip
+ # to satisfy the current checkout.
+ git -C backfill3 sparse-checkout set d &&
+ GIT_TRACE2_EVENT="$(pwd)/sparse-trace2" git \
+ -C backfill3 backfill --sparse &&
+ test_trace2_data promisor fetch_count 8 <sparse-trace2 &&
+ test_trace2_data path-walk paths 15 <sparse-trace2 &&
+ git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
+ test_line_count = 24 missing
+'
+
+test_expect_success 'backfill --sparse without cone mode (positive)' '
+ git clone --no-checkout --filter=blob:none \
+ --single-branch --branch=main \
+ "file://$(pwd)/srv.bare" backfill4 &&
+
+ # No blobs yet
+ git -C backfill4 rev-list --quiet --objects --missing=print HEAD >missing &&
+ test_line_count = 48 missing &&
+
+ # Define sparse-checkout by filename regardless of parent directory.
+ # This downloads 6 blobs to satisfy the checkout.
+ git -C backfill4 sparse-checkout set --no-cone "**/file.1.txt" &&
+ git -C backfill4 checkout main &&
+
+ # Track new blob count
+ git -C backfill4 rev-list --quiet --objects --missing=print HEAD >missing &&
+ test_line_count = 42 missing &&
+
+ GIT_TRACE2_EVENT="$(pwd)/no-cone-trace1" git \
+ -C backfill4 backfill --sparse &&
+ test_trace2_data promisor fetch_count 6 <no-cone-trace1 &&
+
+ # This walk needed to visit all directories to search for these paths.
+ test_trace2_data path-walk paths 12 <no-cone-trace1 &&
+ git -C backfill4 rev-list --quiet --objects --missing=print HEAD >missing &&
+ test_line_count = 36 missing
+'
+
+test_expect_success 'backfill --sparse without cone mode (negative)' '
+ git clone --no-checkout --filter=blob:none \
+ --single-branch --branch=main \
+ "file://$(pwd)/srv.bare" backfill5 &&
+
+ # No blobs yet
+ git -C backfill5 rev-list --quiet --objects --missing=print HEAD >missing &&
+ test_line_count = 48 missing &&
+
+ # Define sparse-checkout by filename regardless of parent directory.
+ # This downloads 18 blobs to satisfy the checkout
+ git -C backfill5 sparse-checkout set --no-cone "**/file*" "!**/file.1.txt" &&
+ git -C backfill5 checkout main &&
+
+ # Track new blob count
+ git -C backfill5 rev-list --quiet --objects --missing=print HEAD >missing &&
+ test_line_count = 30 missing &&
+
+ GIT_TRACE2_EVENT="$(pwd)/no-cone-trace2" git \
+ -C backfill5 backfill --sparse &&
+ test_trace2_data promisor fetch_count 18 <no-cone-trace2 &&
+
+ # This walk needed to visit all directories to search for these paths, plus
+ # 12 extra "file.?.txt" paths than the previous test.
+ test_trace2_data path-walk paths 24 <no-cone-trace2 &&
+ git -C backfill5 rev-list --quiet --objects --missing=print HEAD >missing &&
+ test_line_count = 12 missing
+'
+
. "$TEST_DIRECTORY"/lib-httpd.sh
start_httpd
diff --git a/t/t6601-path-walk.sh b/t/t6601-path-walk.sh
index 5f04acb8a2..c89b0f1e19 100755
--- a/t/t6601-path-walk.sh
+++ b/t/t6601-path-walk.sh
@@ -176,6 +176,38 @@ test_expect_success 'branches and indexed objects mix well' '
test_cmp_sorted expect out
'
+test_expect_success 'base & topic, sparse' '
+ cat >patterns <<-EOF &&
+ /*
+ !/*/
+ /left/
+ EOF
+
+ test-tool path-walk --stdin-pl -- base topic <patterns >out &&
+
+ cat >expect <<-EOF &&
+ 0:commit::$(git rev-parse topic)
+ 0:commit::$(git rev-parse base)
+ 0:commit::$(git rev-parse base~1)
+ 0:commit::$(git rev-parse base~2)
+ 1:tree::$(git rev-parse topic^{tree})
+ 1:tree::$(git rev-parse base^{tree})
+ 1:tree::$(git rev-parse base~1^{tree})
+ 1:tree::$(git rev-parse base~2^{tree})
+ 2:blob:a:$(git rev-parse base~2:a)
+ 3:tree:left/:$(git rev-parse base:left)
+ 3:tree:left/:$(git rev-parse base~2:left)
+ 4:blob:left/b:$(git rev-parse base~2:left/b)
+ 4:blob:left/b:$(git rev-parse base:left/b)
+ blobs:3
+ commits:4
+ tags:0
+ trees:6
+ EOF
+
+ test_cmp_sorted expect out
+'
+
test_expect_success 'topic only' '
test-tool path-walk -- topic >out &&