summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/spack/spack/binary_distribution.py202
-rw-r--r--lib/spack/spack/fetch_strategy.py38
-rw-r--r--lib/spack/spack/test/oci/integration_test.py2
-rw-r--r--lib/spack/spack/test/util/archive.py157
-rw-r--r--lib/spack/spack/util/archive.py228
5 files changed, 423 insertions, 204 deletions
diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py
index 751ec1ef7f..6f401e4a97 100644
--- a/lib/spack/spack/binary_distribution.py
+++ b/lib/spack/spack/binary_distribution.py
@@ -5,7 +5,6 @@
import codecs
import collections
-import errno
import hashlib
import io
import itertools
@@ -23,8 +22,7 @@ import urllib.error
import urllib.parse
import urllib.request
import warnings
-from contextlib import closing, contextmanager
-from gzip import GzipFile
+from contextlib import closing
from typing import Dict, Iterable, List, NamedTuple, Optional, Set, Tuple
from urllib.error import HTTPError, URLError
@@ -50,6 +48,7 @@ import spack.repo
import spack.stage
import spack.store
import spack.traverse as traverse
+import spack.util.archive
import spack.util.crypto
import spack.util.file_cache as file_cache
import spack.util.gpg
@@ -1133,205 +1132,46 @@ def generate_key_index(key_prefix, tmpdir=None):
shutil.rmtree(tmpdir)
-@contextmanager
-def gzip_compressed_tarfile(path):
- """Create a reproducible, compressed tarfile"""
- # Create gzip compressed tarball of the install prefix
- # 1) Use explicit empty filename and mtime 0 for gzip header reproducibility.
- # If the filename="" is dropped, Python will use fileobj.name instead.
- # This should effectively mimick `gzip --no-name`.
- # 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
- # compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
- # compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
- # So we follow gzip.
- with open(path, "wb") as f, ChecksumWriter(f) as inner_checksum, closing(
- GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=inner_checksum)
- ) as gzip_file, ChecksumWriter(gzip_file) as outer_checksum, tarfile.TarFile(
- name="", mode="w", fileobj=outer_checksum
- ) as tar:
- yield tar, inner_checksum, outer_checksum
-
-
-def _tarinfo_name(absolute_path: str, *, _path=pathlib.PurePath) -> str:
- """Compute tarfile entry name as the relative path from the (system) root."""
- return _path(*_path(absolute_path).parts[1:]).as_posix()
-
-
def tarfile_of_spec_prefix(tar: tarfile.TarFile, prefix: str) -> None:
"""Create a tarfile of an install prefix of a spec. Skips existing buildinfo file.
- Only adds regular files, symlinks and dirs. Skips devices, fifos. Preserves hardlinks.
- Normalizes permissions like git. Tar entries are added in depth-first pre-order, with
- dir entries partitioned by file | dir, and sorted alphabetically, for reproducibility.
- Partitioning ensures only one dir is in memory at a time, and sorting improves compression.
Args:
tar: tarfile object to add files to
prefix: absolute install prefix of spec"""
if not os.path.isabs(prefix) or not os.path.isdir(prefix):
raise ValueError(f"prefix '{prefix}' must be an absolute path to a directory")
- hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict()
stat_key = lambda stat: (stat.st_dev, stat.st_ino)
try: # skip buildinfo file if it exists
files_to_skip = [stat_key(os.lstat(buildinfo_file_name(prefix)))]
+ skip = lambda entry: stat_key(entry.stat(follow_symlinks=False)) in files_to_skip
except OSError:
- files_to_skip = []
-
- # First add all directories leading up to `prefix` (Spack <= 0.21 did not do this, leading to
- # issues when tarballs are used in runtimes like AWS lambda). Skip the file system root.
- parent_dirs = reversed(pathlib.Path(prefix).parents)
- next(parent_dirs) # skip the root: slices are supported from python 3.10
- for parent_dir in parent_dirs:
- dir_info = tarfile.TarInfo(_tarinfo_name(str(parent_dir)))
- dir_info.type = tarfile.DIRTYPE
- dir_info.mode = 0o755
- tar.addfile(dir_info)
-
- dir_stack = [prefix]
- while dir_stack:
- dir = dir_stack.pop()
-
- # Add the dir before its contents
- dir_info = tarfile.TarInfo(_tarinfo_name(dir))
- dir_info.type = tarfile.DIRTYPE
- dir_info.mode = 0o755
- tar.addfile(dir_info)
-
- # Sort by name: reproducible & improves compression
- with os.scandir(dir) as it:
- entries = sorted(it, key=lambda entry: entry.name)
-
- new_dirs = []
- for entry in entries:
- if entry.is_dir(follow_symlinks=False):
- new_dirs.append(entry.path)
- continue
-
- file_info = tarfile.TarInfo(_tarinfo_name(entry.path))
-
- s = entry.stat(follow_symlinks=False)
-
- # Skip existing binary distribution files.
- id = stat_key(s)
- if id in files_to_skip:
- continue
-
- # Normalize the mode
- file_info.mode = 0o644 if s.st_mode & 0o100 == 0 else 0o755
-
- if entry.is_symlink():
- file_info.type = tarfile.SYMTYPE
- file_info.linkname = os.readlink(entry.path)
- tar.addfile(file_info)
-
- elif entry.is_file(follow_symlinks=False):
- # Deduplicate hardlinks
- if s.st_nlink > 1:
- if id in hardlink_to_tarinfo_name:
- file_info.type = tarfile.LNKTYPE
- file_info.linkname = hardlink_to_tarinfo_name[id]
- tar.addfile(file_info)
- continue
- hardlink_to_tarinfo_name[id] = file_info.name
-
- # If file not yet seen, copy it.
- file_info.type = tarfile.REGTYPE
- file_info.size = s.st_size
-
- with open(entry.path, "rb") as f:
- tar.addfile(file_info, f)
-
- dir_stack.extend(reversed(new_dirs)) # we pop, so reverse to stay alphabetical
-
-
-class ChecksumWriter(io.BufferedIOBase):
- """Checksum writer computes a checksum while writing to a file."""
-
- myfileobj = None
-
- def __init__(self, fileobj, algorithm=hashlib.sha256):
- self.fileobj = fileobj
- self.hasher = algorithm()
- self.length = 0
-
- def hexdigest(self):
- return self.hasher.hexdigest()
-
- def write(self, data):
- if isinstance(data, (bytes, bytearray)):
- length = len(data)
- else:
- data = memoryview(data)
- length = data.nbytes
-
- if length > 0:
- self.fileobj.write(data)
- self.hasher.update(data)
-
- self.length += length
-
- return length
-
- def read(self, size=-1):
- raise OSError(errno.EBADF, "read() on write-only object")
-
- def read1(self, size=-1):
- raise OSError(errno.EBADF, "read1() on write-only object")
-
- def peek(self, n):
- raise OSError(errno.EBADF, "peek() on write-only object")
-
- @property
- def closed(self):
- return self.fileobj is None
-
- def close(self):
- fileobj = self.fileobj
- if fileobj is None:
- return
- self.fileobj.close()
- self.fileobj = None
-
- def flush(self):
- self.fileobj.flush()
-
- def fileno(self):
- return self.fileobj.fileno()
-
- def rewind(self):
- raise OSError("Can't rewind while computing checksum")
-
- def readable(self):
- return False
-
- def writable(self):
- return True
-
- def seekable(self):
- return True
-
- def tell(self):
- return self.fileobj.tell()
-
- def seek(self, offset, whence=io.SEEK_SET):
- # In principle forward seek is possible with b"0" padding,
- # but this is not implemented.
- if offset == 0 and whence == io.SEEK_CUR:
- return
- raise OSError("Can't seek while computing checksum")
-
- def readline(self, size=-1):
- raise OSError(errno.EBADF, "readline() on write-only object")
+ skip = lambda entry: False
+
+ spack.util.archive.reproducible_tarfile_from_prefix(
+ tar,
+ prefix,
+ # Spack <= 0.21 did not include parent directories, leading to issues when tarballs are
+ # used in runtimes like AWS lambda.
+ include_parent_directories=True,
+ skip=skip,
+ )
def _do_create_tarball(tarfile_path: str, binaries_dir: str, buildinfo: dict):
- with gzip_compressed_tarfile(tarfile_path) as (tar, inner_checksum, outer_checksum):
+ with spack.util.archive.gzip_compressed_tarfile(tarfile_path) as (
+ tar,
+ inner_checksum,
+ outer_checksum,
+ ):
# Tarball the install prefix
tarfile_of_spec_prefix(tar, binaries_dir)
# Serialize buildinfo for the tarball
bstring = syaml.dump(buildinfo, default_flow_style=True).encode("utf-8")
- tarinfo = tarfile.TarInfo(name=_tarinfo_name(buildinfo_file_name(binaries_dir)))
+ tarinfo = tarfile.TarInfo(
+ name=spack.util.archive.default_path_to_name(buildinfo_file_name(binaries_dir))
+ )
tarinfo.type = tarfile.REGTYPE
tarinfo.size = len(bstring)
tarinfo.mode = 0o644
diff --git a/lib/spack/spack/fetch_strategy.py b/lib/spack/spack/fetch_strategy.py
index 864fcddcc3..93df10c98f 100644
--- a/lib/spack/spack/fetch_strategy.py
+++ b/lib/spack/spack/fetch_strategy.py
@@ -30,6 +30,7 @@ import re
import shutil
import urllib.error
import urllib.parse
+from pathlib import PurePath
from typing import List, Optional
import llnl.url
@@ -37,13 +38,14 @@ import llnl.util
import llnl.util.filesystem as fs
import llnl.util.tty as tty
from llnl.string import comma_and, quote
-from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, temp_rename, working_dir
+from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, working_dir
from llnl.util.symlink import symlink
import spack.config
import spack.error
import spack.oci.opener
import spack.url
+import spack.util.archive
import spack.util.crypto as crypto
import spack.util.git
import spack.util.url as url_util
@@ -600,29 +602,21 @@ class VCSFetchStrategy(FetchStrategy):
tty.debug("Source fetched with %s is already expanded." % self.url_attr)
@_needs_stage
- def archive(self, destination, **kwargs):
+ def archive(self, destination, *, exclude: Optional[str] = None):
assert llnl.url.extension_from_path(destination) == "tar.gz"
assert self.stage.source_path.startswith(self.stage.path)
-
- tar = which("tar", required=True)
-
- patterns = kwargs.get("exclude", None)
- if patterns is not None:
- if isinstance(patterns, str):
- patterns = [patterns]
- for p in patterns:
- tar.add_default_arg("--exclude=%s" % p)
-
- with working_dir(self.stage.path):
- if self.stage.srcdir:
- # Here we create an archive with the default repository name.
- # The 'tar' command has options for changing the name of a
- # directory that is included in the archive, but they differ
- # based on OS, so we temporarily rename the repo
- with temp_rename(self.stage.source_path, self.stage.srcdir):
- tar("-czf", destination, self.stage.srcdir)
- else:
- tar("-czf", destination, os.path.basename(self.stage.source_path))
+ # We need to prepend this dir name to every entry of the tarfile
+ top_level_dir = PurePath(self.stage.srcdir or os.path.basename(self.stage.source_path))
+
+ with working_dir(self.stage.source_path), spack.util.archive.gzip_compressed_tarfile(
+ destination
+ ) as (tar, _, _):
+ spack.util.archive.reproducible_tarfile_from_prefix(
+ tar=tar,
+ prefix=".",
+ skip=lambda entry: entry.name == exclude,
+ path_to_name=lambda path: (top_level_dir / PurePath(path)).as_posix(),
+ )
def __str__(self):
return "VCS: %s" % self.url
diff --git a/lib/spack/spack/test/oci/integration_test.py b/lib/spack/spack/test/oci/integration_test.py
index a2b1ac2f4f..8129dd22cf 100644
--- a/lib/spack/spack/test/oci/integration_test.py
+++ b/lib/spack/spack/test/oci/integration_test.py
@@ -13,11 +13,11 @@ from contextlib import contextmanager
import spack.environment as ev
import spack.oci.opener
-from spack.binary_distribution import gzip_compressed_tarfile
from spack.main import SpackCommand
from spack.oci.image import Digest, ImageReference, default_config, default_manifest
from spack.oci.oci import blob_exists, get_manifest_and_config, upload_blob, upload_manifest
from spack.test.oci.mock_registry import DummyServer, InMemoryOCIRegistry, create_opener
+from spack.util.archive import gzip_compressed_tarfile
buildcache = SpackCommand("buildcache")
mirror = SpackCommand("mirror")
diff --git a/lib/spack/spack/test/util/archive.py b/lib/spack/spack/test/util/archive.py
new file mode 100644
index 0000000000..9688f25924
--- /dev/null
+++ b/lib/spack/spack/test/util/archive.py
@@ -0,0 +1,157 @@
+# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+import gzip
+import hashlib
+import os
+import shutil
+import tarfile
+from pathlib import Path, PurePath
+
+import spack.util.crypto
+from spack.util.archive import gzip_compressed_tarfile, reproducible_tarfile_from_prefix
+
+
+def test_gzip_compressed_tarball_is_reproducible(tmpdir):
+ """Test gzip_compressed_tarfile and reproducible_tarfile_from_prefix for reproducibility"""
+
+ with tmpdir.as_cwd():
+ # Create a few directories
+ root = Path("root")
+ dir_a = root / "a"
+ dir_b = root / "b"
+ root.mkdir(mode=0o777)
+ dir_a.mkdir(mode=0o777)
+ dir_b.mkdir(mode=0o777)
+
+ (root / "y").touch()
+ (root / "x").touch()
+
+ (dir_a / "executable").touch(mode=0o777)
+ (dir_a / "data").touch(mode=0o666)
+ (dir_a / "symlink_file").symlink_to("data")
+ (dir_a / "symlink_dir").symlink_to(PurePath("..", "b"))
+ try:
+ os.link(dir_a / "executable", dir_a / "hardlink")
+ hardlink_support = True
+ except OSError:
+ hardlink_support = False
+
+ (dir_b / "executable").touch(mode=0o777)
+ (dir_b / "data").touch(mode=0o666)
+ (dir_b / "symlink_file").symlink_to("data")
+ (dir_b / "symlink_dir").symlink_to(PurePath("..", "a"))
+
+ # Create the first tarball
+ with gzip_compressed_tarfile("fst.tar.gz") as (tar, gzip_checksum_1, tarfile_checksum_1):
+ reproducible_tarfile_from_prefix(tar, "root")
+
+ # Expected mode for non-dirs is 644 if not executable, 755 if executable. Better to compute
+ # that as we don't know the umask of the user running the test.
+ expected_mode = (
+ lambda name: 0o755 if Path(*name.split("/")).lstat().st_mode & 0o100 else 0o644
+ )
+
+ # Verify the tarball contents
+ with tarfile.open("fst.tar.gz", "r:gz") as tar:
+ # Directories (mode is always 755)
+ for dir in ("root", "root/a", "root/b"):
+ m = tar.getmember(dir)
+ assert m.isdir()
+ assert m.mode == 0o755
+ assert m.uid == m.gid == 0
+ assert m.uname == m.gname == ""
+
+ # Non-executable regular files
+ for file in (
+ "root/x",
+ "root/y",
+ "root/a/data",
+ "root/b/data",
+ "root/a/executable",
+ "root/b/executable",
+ ):
+ m = tar.getmember(file)
+ assert m.isreg()
+ assert m.mode == expected_mode(file)
+ assert m.uid == m.gid == 0
+ assert m.uname == m.gname == ""
+
+ # Symlinks
+ for file in (
+ "root/a/symlink_file",
+ "root/a/symlink_dir",
+ "root/b/symlink_file",
+ "root/b/symlink_dir",
+ ):
+ m = tar.getmember(file)
+ assert m.issym()
+ assert m.mode == 0o755
+ assert m.uid == m.gid == m.mtime == 0
+ assert m.uname == m.gname == ""
+
+ # Verify the symlink targets. Notice that symlink targets are copied verbatim. That
+ # means the value is platform specific for relative symlinks within the current prefix,
+        # as on Windows they'd be ..\a and ..\b instead of ../a and ../b. So, reproducibility
+ # is only guaranteed per-platform currently.
+ assert PurePath(tar.getmember("root/a/symlink_file").linkname) == PurePath("data")
+ assert PurePath(tar.getmember("root/b/symlink_file").linkname) == PurePath("data")
+ assert PurePath(tar.getmember("root/a/symlink_dir").linkname) == PurePath("..", "b")
+ assert PurePath(tar.getmember("root/b/symlink_dir").linkname) == PurePath("..", "a")
+
+ # Check hardlink if supported
+ if hardlink_support:
+ m = tar.getmember("root/a/hardlink")
+ assert m.islnk()
+ assert m.mode == expected_mode("root/a/hardlink")
+ assert m.uid == m.gid == 0
+ assert m.uname == m.gname == ""
+ # Hardlink targets are always in posix format, as they reference a file that exists
+ # in the tarball.
+ assert m.linkname == "root/a/executable"
+
+ # Finally verify if entries are ordered by (is_dir, name)
+ assert [t.name for t in tar.getmembers()] == [
+ "root",
+ "root/x",
+ "root/y",
+ "root/a",
+ "root/a/data",
+ "root/a/executable",
+ *(["root/a/hardlink"] if hardlink_support else []),
+ "root/a/symlink_dir",
+ "root/a/symlink_file",
+ "root/b",
+ "root/b/data",
+ "root/b/executable",
+ "root/b/symlink_dir",
+ "root/b/symlink_file",
+ ]
+
+ # Delete the current root dir, extract the first tarball, create a second
+ shutil.rmtree(root)
+ with tarfile.open("fst.tar.gz", "r:gz") as tar:
+ tar.extractall()
+
+ # Create the second tarball
+ with gzip_compressed_tarfile("snd.tar.gz") as (tar, gzip_checksum_2, tarfile_checksum_2):
+ reproducible_tarfile_from_prefix(tar, "root")
+
+ # Verify the .tar.gz checksums are identical and correct
+ assert (
+ gzip_checksum_1.hexdigest()
+ == gzip_checksum_2.hexdigest()
+ == spack.util.crypto.checksum(hashlib.sha256, "fst.tar.gz")
+ == spack.util.crypto.checksum(hashlib.sha256, "snd.tar.gz")
+ )
+
+ # Verify the .tar checksums are identical and correct
+ with gzip.open("fst.tar.gz", "rb") as f, gzip.open("snd.tar.gz", "rb") as g:
+ assert (
+ tarfile_checksum_1.hexdigest()
+ == tarfile_checksum_2.hexdigest()
+ == spack.util.crypto.checksum_stream(hashlib.sha256, f)
+ == spack.util.crypto.checksum_stream(hashlib.sha256, g)
+ )
diff --git a/lib/spack/spack/util/archive.py b/lib/spack/spack/util/archive.py
new file mode 100644
index 0000000000..8bde40017c
--- /dev/null
+++ b/lib/spack/spack/util/archive.py
@@ -0,0 +1,228 @@
+# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+import errno
+import hashlib
+import io
+import os
+import pathlib
+import tarfile
+from contextlib import closing, contextmanager
+from gzip import GzipFile
+from typing import Callable, Dict, Tuple
+
+
+class ChecksumWriter(io.BufferedIOBase):
+ """Checksum writer computes a checksum while writing to a file."""
+
+ myfileobj = None
+
+ def __init__(self, fileobj, algorithm=hashlib.sha256):
+ self.fileobj = fileobj
+ self.hasher = algorithm()
+ self.length = 0
+
+ def hexdigest(self):
+ return self.hasher.hexdigest()
+
+ def write(self, data):
+ if isinstance(data, (bytes, bytearray)):
+ length = len(data)
+ else:
+ data = memoryview(data)
+ length = data.nbytes
+
+ if length > 0:
+ self.fileobj.write(data)
+ self.hasher.update(data)
+
+ self.length += length
+
+ return length
+
+ def read(self, size=-1):
+ raise OSError(errno.EBADF, "read() on write-only object")
+
+ def read1(self, size=-1):
+ raise OSError(errno.EBADF, "read1() on write-only object")
+
+ def peek(self, n):
+ raise OSError(errno.EBADF, "peek() on write-only object")
+
+ @property
+ def closed(self):
+ return self.fileobj is None
+
+ def close(self):
+ fileobj = self.fileobj
+ if fileobj is None:
+ return
+ self.fileobj.close()
+ self.fileobj = None
+
+ def flush(self):
+ self.fileobj.flush()
+
+ def fileno(self):
+ return self.fileobj.fileno()
+
+ def rewind(self):
+ raise OSError("Can't rewind while computing checksum")
+
+ def readable(self):
+ return False
+
+ def writable(self):
+ return True
+
+ def seekable(self):
+ return True
+
+ def tell(self):
+ return self.fileobj.tell()
+
+ def seek(self, offset, whence=io.SEEK_SET):
+ # In principle forward seek is possible with b"0" padding,
+ # but this is not implemented.
+ if offset == 0 and whence == io.SEEK_CUR:
+ return
+ raise OSError("Can't seek while computing checksum")
+
+ def readline(self, size=-1):
+ raise OSError(errno.EBADF, "readline() on write-only object")
+
+
+@contextmanager
+def gzip_compressed_tarfile(path):
+ """Create a reproducible, gzip compressed tarfile, and keep track of shasums of both the
+    compressed and uncompressed tarfile. Reproducibility is achieved by normalizing the gzip header
+ (no file name and zero mtime).
+
+ Yields a tuple of the following:
+ tarfile.TarFile: tarfile object
+ ChecksumWriter: checksum of the gzip compressed tarfile
+ ChecksumWriter: checksum of the uncompressed tarfile
+ """
+ # Create gzip compressed tarball of the install prefix
+ # 1) Use explicit empty filename and mtime 0 for gzip header reproducibility.
+ # If the filename="" is dropped, Python will use fileobj.name instead.
+    #    This should effectively mimic `gzip --no-name`.
+ # 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
+ # compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
+ # compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
+ # So we follow gzip.
+ with open(path, "wb") as f, ChecksumWriter(f) as gzip_checksum, closing(
+ GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=gzip_checksum)
+ ) as gzip_file, ChecksumWriter(gzip_file) as tarfile_checksum, tarfile.TarFile(
+ name="", mode="w", fileobj=tarfile_checksum
+ ) as tar:
+ yield tar, gzip_checksum, tarfile_checksum
+
+
+def default_path_to_name(path: str) -> str:
+ """Converts a path to a tarfile name, which uses posix path separators."""
+ p = pathlib.PurePath(path)
+ # Drop the leading slash on posix and the drive letter on windows, and always format as a
+ # posix path.
+ return pathlib.PurePath(*p.parts[1:]).as_posix() if p.is_absolute() else p.as_posix()
+
+
+def reproducible_tarfile_from_prefix(
+ tar: tarfile.TarFile,
+ prefix: str,
+ *,
+ include_parent_directories: bool = False,
+ skip: Callable[[os.DirEntry], bool] = lambda entry: False,
+ path_to_name: Callable[[str], str] = default_path_to_name,
+) -> None:
+ """Create a tarball from a given directory. Only adds regular files, symlinks and dirs.
+ Skips devices, fifos. Preserves hardlinks. Normalizes permissions like git. Tar entries are
+ added in depth-first pre-order, with dir entries partitioned by file | dir, and sorted
+ lexicographically, for reproducibility. Partitioning ensures only one dir is in memory at a
+ time, and sorting improves compression.
+
+ Args:
+ tar: tarfile object opened in write mode
+ prefix: path to directory to tar (either absolute or relative)
+ include_parent_directories: whether to include every directory leading up to ``prefix`` in
+ the tarball
+ skip: function that receives a DirEntry and returns True if the entry should be skipped,
+ whether it is a file or directory. Default implementation does not skip anything.
+ path_to_name: function that converts a path string to a tarfile entry name, which should be
+ in posix format. Not only is it necessary to transform paths in certain cases, such as
+ windows path to posix format, but it can also be used to prepend a directory to each
+ entry even if it does not exist on the filesystem. The default implementation drops the
+ leading slash on posix and the drive letter on windows for absolute paths, and formats
+            as a posix path."""
+
+ hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict()
+
+ if include_parent_directories:
+ parent_dirs = reversed(pathlib.Path(prefix).parents)
+ next(parent_dirs) # skip the root: slices are supported from python 3.10
+ for parent_dir in parent_dirs:
+ dir_info = tarfile.TarInfo(path_to_name(str(parent_dir)))
+ dir_info.type = tarfile.DIRTYPE
+ dir_info.mode = 0o755
+ tar.addfile(dir_info)
+
+ dir_stack = [prefix]
+ while dir_stack:
+ dir = dir_stack.pop()
+
+ # Add the dir before its contents
+ dir_info = tarfile.TarInfo(path_to_name(dir))
+ dir_info.type = tarfile.DIRTYPE
+ dir_info.mode = 0o755
+ tar.addfile(dir_info)
+
+ # Sort by name: reproducible & improves compression
+ with os.scandir(dir) as it:
+ entries = sorted(it, key=lambda entry: entry.name)
+
+ new_dirs = []
+ for entry in entries:
+ if skip(entry):
+ continue
+
+ if entry.is_dir(follow_symlinks=False):
+ new_dirs.append(entry.path)
+ continue
+
+ file_info = tarfile.TarInfo(path_to_name(entry.path))
+
+ if entry.is_symlink():
+ file_info.type = tarfile.SYMTYPE
+ file_info.linkname = os.readlink(entry.path)
+ # According to POSIX: "the value of the file mode bits returned in the
+ # st_mode field of the stat structure is unspecified." So we set it to
+ # something sensible without lstat'ing the link.
+ file_info.mode = 0o755
+ tar.addfile(file_info)
+
+ elif entry.is_file(follow_symlinks=False):
+ # entry.stat has zero (st_ino, st_dev, st_nlink) on Windows: use lstat.
+ s = os.lstat(entry.path)
+
+ # Normalize permissions like git
+ file_info.mode = 0o755 if s.st_mode & 0o100 else 0o644
+
+ # Deduplicate hardlinks
+ if s.st_nlink > 1:
+ ident = (s.st_dev, s.st_ino)
+ if ident in hardlink_to_tarinfo_name:
+ file_info.type = tarfile.LNKTYPE
+ file_info.linkname = hardlink_to_tarinfo_name[ident]
+ tar.addfile(file_info)
+ continue
+ hardlink_to_tarinfo_name[ident] = file_info.name
+
+ # If file not yet seen, copy it
+ file_info.type = tarfile.REGTYPE
+ file_info.size = s.st_size
+
+ with open(entry.path, "rb") as f:
+ tar.addfile(file_info, f)
+
+ dir_stack.extend(reversed(new_dirs)) # we pop, so reverse to stay alphabetical