From 6ad11e2c28d9c3b3d5dcb8986b726d2d2db18cc3 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 13 May 2026 12:06:19 -0700 Subject: [PATCH 1/4] promisor-remote: document caller filtering contract promisor_remote_get_direct() does not, on its happy path, filter out OIDs that are already present in the local object store: every OID the caller supplies is written to the fetch subprocess's stdin and ends up in the response pack. The only filtering it performs is in remove_fetched_oids(), and that only runs after a fetch failure when falling back to a different configured promisor remote. Almost every existing caller already filters locally-present OIDs out itself (typically with odb_read_object_info_extended() and OBJECT_INFO_FOR_PREFETCH, or odb_has_object() with no fetch flag). But the existing API comment does not state this expectation, so a new caller is easy to write incorrectly (I missed this originally and wrote two problematic callers). Omitting the filter still "works" in the sense that the desired objects end up local, but it silently makes the fetch request -- and the response pack -- larger than necessary, defeating part of the point of batching. Spell the contract out so future callers know to filter (and deduplicate) themselves, and point them at the helpers they should use to check local presence without accidentally triggering a lazy fetch. Signed-off-by: Elijah Newren --- promisor-remote.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/promisor-remote.h b/promisor-remote.h index 3d4d2de01818ea..301f5ac5cb4b58 100644 --- a/promisor-remote.h +++ b/promisor-remote.h @@ -29,6 +29,17 @@ int repo_has_promisor_remote(struct repository *r); * Fetches all requested objects from all promisor remotes, trying them one at * a time until all objects are fetched. 
* + * Callers are responsible for filtering out OIDs that are already present + * locally before calling this function: every supplied OID is sent in the + * fetch request, even if the object already exists in the local object + * store. (Only after a fetch failure does this function fall back to + * stripping already-present OIDs from the list before trying the next + * configured promisor remote.) Callers should also deduplicate the OIDs. + * + * To test for local presence without triggering a lazy fetch (which would + * defeat the purpose of batching), use odb_has_object(..., 0) or + * odb_read_object_info_extended() with OBJECT_INFO_FOR_PREFETCH. + * * If oid_nr is 0, this function returns immediately. */ void promisor_remote_get_direct(struct repository *repo, From 08a2c6517bfc75fd7ee514fa513b6f57659acca7 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 11 Apr 2026 20:47:18 -0700 Subject: [PATCH 2/4] patch-ids.h: add missing trailing parenthesis in documentation comment Signed-off-by: Elijah Newren --- patch-ids.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patch-ids.h b/patch-ids.h index 490d739371666e..57534ee7222560 100644 --- a/patch-ids.h +++ b/patch-ids.h @@ -37,7 +37,7 @@ int has_commit_patch_id(struct commit *commit, struct patch_ids *); * struct patch_id *cur; * for (cur = patch_id_iter_first(commit, ids); * cur; - * cur = patch_id_iter_next(cur, ids) { + * cur = patch_id_iter_next(cur, ids)) { * ... look at cur->commit * } */ From c0655e5d41012d6d11caa018d6f4b222426f2c7b Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 7 Apr 2026 12:16:33 -0700 Subject: [PATCH 3/4] builtin/log: prefetch necessary blobs for `git cherry` In partial clones, `git cherry` fetches necessary blobs on-demand one at a time, which can be very slow. We would like to prefetch all necessary blobs upfront. To do so, we need to be able to first figure out which blobs are needed. 
`git cherry` does its work in a two-phase approach: first computing header-only IDs (based on file paths and modes), then falling back to full content-based IDs only when header-only IDs collide -- or, more accurately, whenever the oidhash() of the header-only object_ids collide. patch-ids.c handles this by creating an ids->patches hashmap that has all the data we need, but the problem is that any attempt to query the hashmap will invoke the patch_id_neq() function on any colliding objects, which causes the on-demand fetching. Insert a new prefetch_cherry_blobs() function before checking for collisions. Use a temporary replacement on the ids->patches.cmpfn in order to enumerate the blobs that would be needed without yet fetching them, and then fetch them all at once, then restore the old ids->patches.cmpfn. Signed-off-by: Elijah Newren --- builtin/log.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++ t/t3500-cherry.sh | 27 ++++++++++ 2 files changed, 158 insertions(+) diff --git a/builtin/log.c b/builtin/log.c index 8c0939dd42ada2..e464b30af4bcae 100644 --- a/builtin/log.c +++ b/builtin/log.c @@ -21,10 +21,12 @@ #include "color.h" #include "commit.h" #include "diff.h" +#include "diffcore.h" #include "diff-merges.h" #include "revision.h" #include "log-tree.h" #include "oid-array.h" +#include "oidset.h" #include "tag.h" #include "reflog-walk.h" #include "patch-ids.h" @@ -43,9 +45,11 @@ #include "utf8.h" #include "commit-reach.h" +#include "promisor-remote.h" #include "range-diff.h" #include "tmp-objdir.h" #include "tree.h" +#include "userdiff.h" #include "write-or-die.h" #define MAIL_DEFAULT_WRAP 72 @@ -2602,6 +2606,131 @@ static void print_commit(char sign, struct commit *commit, int verbose, } } +/* + * Enumerate blob OIDs from a single commit's diff, inserting them into blobs. + * Skips files whose userdiff driver explicitly declares binary status + * (drv->binary > 0), since patch-ID uses oid_to_hex() for those and + * never reads blob content. 
Use userdiff_find_by_path() since + * diff_filespec_load_driver() is static in diff.c. + * + * Clean up with diff_queue_clear() (from diffcore.h). + */ +static void collect_diff_blob_oids(struct commit *commit, + struct diff_options *opts, + struct oidset *blobs) +{ + struct diff_queue_struct *q; + + /* + * Merge commits are filtered out by patch_id_defined() in patch-ids.c, + * so we'll never be called with one. + */ + assert(!commit->parents || !commit->parents->next); + + if (commit->parents) + diff_tree_oid(&commit->parents->item->object.oid, + &commit->object.oid, "", opts); + else + diff_root_tree_oid(&commit->object.oid, "", opts); + diffcore_std(opts); + + q = &diff_queued_diff; + for (int i = 0; i < q->nr; i++) { + struct diff_filepair *p = q->queue[i]; + struct userdiff_driver *drv; + + /* Skip binary files */ + drv = userdiff_find_by_path(opts->repo->index, p->one->path); + if (drv && drv->binary > 0) + continue; + + if (DIFF_FILE_VALID(p->one) && + odb_read_object_info_extended(opts->repo->objects, + &p->one->oid, NULL, + OBJECT_INFO_FOR_PREFETCH)) + oidset_insert(blobs, &p->one->oid); + if (DIFF_FILE_VALID(p->two) && + odb_read_object_info_extended(opts->repo->objects, + &p->two->oid, NULL, + OBJECT_INFO_FOR_PREFETCH)) + oidset_insert(blobs, &p->two->oid); + } + diff_queue_clear(q); +} + +static int always_match(const void *cmp_data UNUSED, + const struct hashmap_entry *entry1 UNUSED, + const struct hashmap_entry *entry2 UNUSED, + const void *keydata UNUSED) +{ + return 0; +} + +/* + * Prefetch blobs for git cherry in partial clones. + * + * Called between the revision walk (which builds the head-side + * commit list) and the has_commit_patch_id() comparison loop. + * + * Uses a cmpfn-swap trick to avoid reading blobs: temporarily + * replaces the hashmap's comparison function with a trivial + * always-match function, so hashmap_get()/hashmap_get_next() match + * any entry with the same oidhash bucket. 
These are the set of oids + * that would trigger patch_id_neq() during normal lookup and cause + * blobs to be read on demand, and we want to prefetch them all at + * once instead. + */ +static void prefetch_cherry_blobs(struct repository *repo, + struct commit_list *list, + struct patch_ids *ids) +{ + struct oidset blobs = OIDSET_INIT; + hashmap_cmp_fn original_cmpfn; + + /* Exit if we're not in a partial clone */ + if (!repo_has_promisor_remote(repo)) + return; + + /* Save original cmpfn, replace with always_match */ + original_cmpfn = ids->patches.cmpfn; + ids->patches.cmpfn = always_match; + + /* Find header-only collisions, gather blobs from those commits */ + for (struct commit_list *l = list; l; l = l->next) { + struct commit *c = l->item; + bool match_found = false; + for (struct patch_id *cur = patch_id_iter_first(c, ids); + cur; + cur = patch_id_iter_next(cur, ids)) { + match_found = true; + collect_diff_blob_oids(cur->commit, &ids->diffopts, + &blobs); + } + if (match_found) + collect_diff_blob_oids(c, &ids->diffopts, &blobs); + } + + /* Restore original cmpfn */ + ids->patches.cmpfn = original_cmpfn; + + /* If we have any blobs to fetch, fetch them */ + if (oidset_size(&blobs)) { + struct oid_array to_fetch = OID_ARRAY_INIT; + struct oidset_iter iter; + const struct object_id *oid; + + oidset_iter_init(&blobs, &iter); + while ((oid = oidset_iter_next(&iter))) + oid_array_append(&to_fetch, oid); + + promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr); + + oid_array_clear(&to_fetch); + } + + oidset_clear(&blobs); +} + int cmd_cherry(int argc, const char **argv, const char *prefix, @@ -2673,6 +2802,8 @@ int cmd_cherry(int argc, commit_list_insert(commit, &list); } + prefetch_cherry_blobs(the_repository, list, &ids); + for (struct commit_list *l = list; l; l = l->next) { char sign = '+'; diff --git a/t/t3500-cherry.sh b/t/t3500-cherry.sh index 78c3eac54b599e..3e66827d7641d8 100755 --- a/t/t3500-cherry.sh +++ b/t/t3500-cherry.sh @@ -78,4 +78,31 @@ 
test_expect_success 'cherry ignores whitespace' '
 	test_cmp expect actual
 '
 
+# Reuse the expect file from the previous test, in a partial clone
+test_expect_success 'cherry in partial clone does bulk prefetch' '
+	test_config uploadpack.allowfilter 1 &&
+	test_config uploadpack.allowanysha1inwant 1 &&
+	test_when_finished "rm -rf copy" &&
+
+	git clone --bare --filter=blob:none file://"$(pwd)" copy &&
+	(
+		cd copy &&
+		GIT_TRACE2_EVENT="$(pwd)/trace.output" git cherry upstream-with-space feature-without-space >actual &&
+		test_cmp ../expect actual &&
+
+		grep "child_start.*fetch.negotiationAlgorithm" trace.output >fetches &&
+		test_line_count = 1 fetches &&
+		test_trace2_data promisor fetch_count 4 <trace.output &&
+
+		# A second run should find all objects local and fetch nothing
+		GIT_TRACE2_EVENT="$(pwd)/trace2.output" git cherry upstream-with-space feature-without-space >actual &&
+		test_cmp ../expect actual &&
+
+		! grep "child_start.*fetch.negotiationAlgorithm" trace2.output &&
+		! grep "\"key\":\"fetch_count\"" trace2.output
+	)
+'
+
 test_done

From 75d4ca7cff07a14b2f0beef4524623e541e140a8 Mon Sep 17 00:00:00 2001
From: Elijah Newren
Date: Tue, 7 Apr 2026 12:16:33 -0700
Subject: [PATCH 4/4] grep: prefetch necessary blobs

In partial clones, `git grep` fetches necessary blobs on-demand one at
a time, which can be very slow.  Address this by adding an extra
preliminary walk over the tree, similar to grep_tree(), which collects
the blobs of interest and then prefetches them.
Signed-off-by: Elijah Newren --- builtin/grep.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++ t/t7810-grep.sh | 58 ++++++++++++++++++++ 2 files changed, 201 insertions(+) diff --git a/builtin/grep.c b/builtin/grep.c index e33285e5e69289..85656d8d3f4328 100644 --- a/builtin/grep.c +++ b/builtin/grep.c @@ -28,9 +28,12 @@ #include "object-file.h" #include "object-name.h" #include "odb.h" +#include "oid-array.h" +#include "oidset.h" #include "packfile.h" #include "pager.h" #include "path.h" +#include "promisor-remote.h" #include "read-cache-ll.h" #include "write-or-die.h" @@ -692,6 +695,144 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec, return hit; } +static void collect_blob_oids_for_tree(struct repository *repo, + const struct pathspec *pathspec, + struct tree_desc *tree, + struct strbuf *base, + int tn_len, + struct oidset *blob_oids) +{ + struct name_entry entry; + int old_baselen = base->len; + struct strbuf name = STRBUF_INIT; + enum interesting match = entry_not_interesting; + + while (tree_entry(tree, &entry)) { + if (match != all_entries_interesting) { + strbuf_addstr(&name, base->buf + tn_len); + match = tree_entry_interesting(repo->index, + &entry, &name, + pathspec); + strbuf_reset(&name); + + if (match == all_entries_not_interesting) + break; + if (match == entry_not_interesting) + continue; + } + + strbuf_add(base, entry.path, tree_entry_len(&entry)); + + if (S_ISREG(entry.mode)) { + if (!odb_has_object(repo->objects, &entry.oid, 0)) + oidset_insert(blob_oids, &entry.oid); + } else if (S_ISDIR(entry.mode)) { + enum object_type type; + struct tree_desc sub_tree; + void *data; + unsigned long size; + + data = odb_read_object(repo->objects, &entry.oid, + &type, &size); + if (!data) + die(_("unable to read tree (%s)"), + oid_to_hex(&entry.oid)); + + strbuf_addch(base, '/'); + init_tree_desc(&sub_tree, &entry.oid, data, size); + collect_blob_oids_for_tree(repo, pathspec, &sub_tree, + base, tn_len, blob_oids); + 
free(data); + } + /* + * ...no else clause for S_ISGITLINK: submodules have their + * own promisor configuration and would need separate fetches + * anyway. + */ + + strbuf_setlen(base, old_baselen); + } + + strbuf_release(&name); +} + +static void collect_blob_oids_for_treeish(struct grep_opt *opt, + const struct pathspec *pathspec, + const struct object_id *tree_ish_oid, + const char *name, + struct oidset *blob_oids) +{ + struct tree_desc tree; + void *data; + unsigned long size; + struct strbuf base = STRBUF_INIT; + int len; + + data = odb_read_object_peeled(opt->repo->objects, tree_ish_oid, + OBJ_TREE, &size, NULL); + + if (!data) + return; + + len = name ? strlen(name) : 0; + if (len) { + strbuf_add(&base, name, len); + strbuf_addch(&base, ':'); + } + init_tree_desc(&tree, tree_ish_oid, data, size); + + collect_blob_oids_for_tree(opt->repo, pathspec, &tree, + &base, base.len, blob_oids); + + strbuf_release(&base); + free(data); +} + +static void prefetch_grep_blobs(struct grep_opt *opt, + const struct pathspec *pathspec, + const struct object_array *list) +{ + struct oidset blob_oids = OIDSET_INIT; + + /* Exit if we're not in a partial clone */ + if (!repo_has_promisor_remote(opt->repo)) + return; + + /* For each tree, gather the blobs in it */ + for (int i = 0; i < list->nr; i++) { + struct object *real_obj; + + obj_read_lock(); + real_obj = deref_tag(opt->repo, list->objects[i].item, + NULL, 0); + obj_read_unlock(); + + if (real_obj && + (real_obj->type == OBJ_COMMIT || + real_obj->type == OBJ_TREE)) + collect_blob_oids_for_treeish(opt, pathspec, + &real_obj->oid, + list->objects[i].name, + &blob_oids); + } + + /* Prefetch the blobs we found */ + if (oidset_size(&blob_oids)) { + struct oid_array to_fetch = OID_ARRAY_INIT; + struct oidset_iter iter; + const struct object_id *oid; + + oidset_iter_init(&blob_oids, &iter); + while ((oid = oidset_iter_next(&iter))) + oid_array_append(&to_fetch, oid); + + promisor_remote_get_direct(opt->repo, to_fetch.oid, 
to_fetch.nr); + + oid_array_clear(&to_fetch); + } + oidset_clear(&blob_oids); +} + static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec, struct object *obj, const char *name, const char *path) { @@ -732,6 +873,8 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec, int hit = 0; const unsigned int nr = list->nr; + prefetch_grep_blobs(opt, pathspec, list); + for (i = 0; i < nr; i++) { struct object *real_obj; diff --git a/t/t7810-grep.sh b/t/t7810-grep.sh index 64ac4f04ee97ad..3d08fd2a0c70b1 100755 --- a/t/t7810-grep.sh +++ b/t/t7810-grep.sh @@ -1929,4 +1929,62 @@ test_expect_success 'grep does not report i-t-a and assume unchanged with -L' ' test_cmp expected actual ' +test_expect_success 'grep of revision in partial clone batches prefetch and honors pathspec' ' + test_when_finished "rm -rf grep-partial-src grep-partial" && + + git init grep-partial-src && + ( + cd grep-partial-src && + git config uploadpack.allowfilter 1 && + git config uploadpack.allowanysha1inwant 1 && + mkdir a b && + echo "needle in haystack" >a/matches.txt && + echo "nothing to see here" >a/nomatch.txt && + echo "needle again" >b/matches.md && + git add . && + git commit -m "initial" + ) && + + git clone --no-checkout --filter=blob:none \ + "file://$(pwd)/grep-partial-src" grep-partial && + + # All three blobs are missing immediately after a blobless clone. + git -C grep-partial rev-list --quiet --objects \ + --missing=print HEAD >missing && + test_line_count = 3 missing && + + # A pathspec-limited grep should prefetch only the two blobs + # in a/. It should fetch both blobs in one batched request. + GIT_TRACE2_EVENT="$(pwd)/grep-trace-pathspec" \ + git -C grep-partial grep -c "needle" HEAD -- "a/*.txt" >result && + + # Only a/matches.txt contains "needle" among the matched paths. + test_line_count = 1 result && + + # Exactly the two a/*.txt blobs should have been requested, and + # the server packed those two objects in the response. 
+	test_trace2_data promisor fetch_count 2 <grep-trace-pathspec &&
+	git -C grep-partial rev-list --quiet --objects \
+		--missing=print HEAD >missing &&
+	test_line_count = 1 missing &&
+
+	# A second grep without a pathspec must recurse into both
+	# subdirectories, but should request only the still-missing blob
+	# from the promisor.
+	GIT_TRACE2_EVENT="$(pwd)/grep-trace-all" \
+		git -C grep-partial grep -c "needle" HEAD >result &&
+
+	test_line_count = 2 result &&
+	test_trace2_data promisor fetch_count 1 <grep-trace-all &&
+	git -C grep-partial rev-list --quiet --objects \
+		--missing=print HEAD >missing &&
+	test_line_count = 0 missing
+'
+
 test_done