summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/spack/spack/binary_distribution.py202
-rw-r--r--lib/spack/spack/fetch_strategy.py38
-rw-r--r--lib/spack/spack/test/oci/integration_test.py2
-rw-r--r--lib/spack/spack/test/util/archive.py157
-rw-r--r--lib/spack/spack/util/archive.py228
5 files changed, 423 insertions, 204 deletions
diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py
index 751ec1ef7f..6f401e4a97 100644
--- a/lib/spack/spack/binary_distribution.py
+++ b/lib/spack/spack/binary_distribution.py
@@ -5,7 +5,6 @@
import codecs
import collections
-import errno
import hashlib
import io
import itertools
@@ -23,8 +22,7 @@ import urllib.error
import urllib.parse
import urllib.request
import warnings
-from contextlib import closing, contextmanager
-from gzip import GzipFile
+from contextlib import closing
from typing import Dict, Iterable, List, NamedTuple, Optional, Set, Tuple
from urllib.error import HTTPError, URLError
@@ -50,6 +48,7 @@ import spack.repo
import spack.stage
import spack.store
import spack.traverse as traverse
+import spack.util.archive
import spack.util.crypto
import spack.util.file_cache as file_cache
import spack.util.gpg
@@ -1133,205 +1132,46 @@ def generate_key_index(key_prefix, tmpdir=None):
shutil.rmtree(tmpdir)
-@contextmanager
-def gzip_compressed_tarfile(path):
- """Create a reproducible, compressed tarfile"""
- # Create gzip compressed tarball of the install prefix
- # 1) Use explicit empty filename and mtime 0 for gzip header reproducibility.
- # If the filename="" is dropped, Python will use fileobj.name instead.
- # This should effectively mimick `gzip --no-name`.
- # 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
- # compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
- # compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
- # So we follow gzip.
- with open(path, "wb") as f, ChecksumWriter(f) as inner_checksum, closing(
- GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=inner_checksum)
- ) as gzip_file, ChecksumWriter(gzip_file) as outer_checksum, tarfile.TarFile(
- name="", mode="w", fileobj=outer_checksum
- ) as tar:
- yield tar, inner_checksum, outer_checksum
-
-
-def _tarinfo_name(absolute_path: str, *, _path=pathlib.PurePath) -> str:
- """Compute tarfile entry name as the relative path from the (system) root."""
- return _path(*_path(absolute_path).parts[1:]).as_posix()
-
-
def tarfile_of_spec_prefix(tar: tarfile.TarFile, prefix: str) -> None:
"""Create a tarfile of an install prefix of a spec. Skips existing buildinfo file.
- Only adds regular files, symlinks and dirs. Skips devices, fifos. Preserves hardlinks.
- Normalizes permissions like git. Tar entries are added in depth-first pre-order, with
- dir entries partitioned by file | dir, and sorted alphabetically, for reproducibility.
- Partitioning ensures only one dir is in memory at a time, and sorting improves compression.
Args:
tar: tarfile object to add files to
prefix: absolute install prefix of spec"""
if not os.path.isabs(prefix) or not os.path.isdir(prefix):
raise ValueError(f"prefix '{prefix}' must be an absolute path to a directory")
- hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict()
stat_key = lambda stat: (stat.st_dev, stat.st_ino)
try: # skip buildinfo file if it exists
files_to_skip = [stat_key(os.lstat(buildinfo_file_name(prefix)))]
+ skip = lambda entry: stat_key(entry.stat(follow_symlinks=False)) in files_to_skip
except OSError:
- files_to_skip = []
-
- # First add all directories leading up to `prefix` (Spack <= 0.21 did not do this, leading to
- # issues when tarballs are used in runtimes like AWS lambda). Skip the file system root.
- parent_dirs = reversed(pathlib.Path(prefix).parents)
- next(parent_dirs) # skip the root: slices are supported from python 3.10
- for parent_dir in parent_dirs:
- dir_info = tarfile.TarInfo(_tarinfo_name(str(parent_dir)))
- dir_info.type = tarfile.DIRTYPE
- dir_info.mode = 0o755
- tar.addfile(dir_info)
-
- dir_stack = [prefix]
- while dir_stack:
- dir = dir_stack.pop()
-
- # Add the dir before its contents
- dir_info = tarfile.TarInfo(_tarinfo_name(dir))
- dir_info.type = tarfile.DIRTYPE
- dir_info.mode = 0o755
- tar.addfile(dir_info)
-
- # Sort by name: reproducible & improves compression
- with os.scandir(dir) as it:
- entries = sorted(it, key=lambda entry: entry.name)
-
- new_dirs = []
- for entry in entries:
- if entry.is_dir(follow_symlinks=False):
- new_dirs.append(entry.path)
- continue
-
- file_info = tarfile.TarInfo(_tarinfo_name(entry.path))
-
- s = entry.stat(follow_symlinks=False)
-
- # Skip existing binary distribution files.
- id = stat_key(s)
- if id in files_to_skip:
- continue
-
- # Normalize the mode
- file_info.mode = 0o644 if s.st_mode & 0o100 == 0 else 0o755
-
- if entry.is_symlink():
- file_info.type = tarfile.SYMTYPE
- file_info.linkname = os.readlink(entry.path)
- tar.addfile(file_info)
-
- elif entry.is_file(follow_symlinks=False):
- # Deduplicate hardlinks
- if s.st_nlink > 1:
- if id in hardlink_to_tarinfo_name:
- file_info.type = tarfile.LNKTYPE
- file_info.linkname = hardlink_to_tarinfo_name[id]
- tar.addfile(file_info)
- continue
- hardlink_to_tarinfo_name[id] = file_info.name
-
- # If file not yet seen, copy it.
- file_info.type = tarfile.REGTYPE
- file_info.size = s.st_size
-
- with open(entry.path, "rb") as f:
- tar.addfile(file_info, f)
-
- dir_stack.extend(reversed(new_dirs)) # we pop, so reverse to stay alphabetical
-
-
-class ChecksumWriter(io.BufferedIOBase):
- """Checksum writer computes a checksum while writing to a file."""
-
- myfileobj = None
-
- def __init__(self, fileobj, algorithm=hashlib.sha256):
- self.fileobj = fileobj
- self.hasher = algorithm()
- self.length = 0
-
- def hexdigest(self):
- return self.hasher.hexdigest()
-
- def write(self, data):
- if isinstance(data, (bytes, bytearray)):
- length = len(data)
- else:
- data = memoryview(data)
- length = data.nbytes
-
- if length > 0:
- self.fileobj.write(data)
- self.hasher.update(data)
-
- self.length += length
-
- return length
-
- def read(self, size=-1):
- raise OSError(errno.EBADF, "read() on write-only object")
-
- def read1(self, size=-1):
- raise OSError(errno.EBADF, "read1() on write-only object")
-
- def peek(self, n):
- raise OSError(errno.EBADF, "peek() on write-only object")
-
- @property
- def closed(self):
- return self.fileobj is None
-
- def close(self):
- fileobj = self.fileobj
- if fileobj is None:
- return
- self.fileobj.close()
- self.fileobj = None
-
- def flush(self):
- self.fileobj.flush()
-
- def fileno(self):
- return self.fileobj.fileno()
-
- def rewind(self):
- raise OSError("Can't rewind while computing checksum")
-
- def readable(self):
- return False
-
- def writable(self):
- return True
-
- def seekable(self):
- return True
-
- def tell(self):
- return self.fileobj.tell()
-
- def seek(self, offset, whence=io.SEEK_SET):
- # In principle forward seek is possible with b"0" padding,
- # but this is not implemented.
- if offset == 0 and whence == io.SEEK_CUR:
- return
- raise OSError("Can't seek while computing checksum")
-
- def readline(self, size=-1):
- raise OSError(errno.EBADF, "readline() on write-only object")
+ skip = lambda entry: False
+
+ spack.util.archive.reproducible_tarfile_from_prefix(
+ tar,
+ prefix,
+ # Spack <= 0.21 did not include parent directories, leading to issues when tarballs are
+ # used in runtimes like AWS lambda.
+ include_parent_directories=True,
+ skip=skip,
+ )
def _do_create_tarball(tarfile_path: str, binaries_dir: str, buildinfo: dict):
- with gzip_compressed_tarfile(tarfile_path) as (tar, inner_checksum, outer_checksum):
+ with spack.util.archive.gzip_compressed_tarfile(tarfile_path) as (
+ tar,
+ inner_checksum,
+ outer_checksum,
+ ):
# Tarball the install prefix
tarfile_of_spec_prefix(tar, binaries_dir)
# Serialize buildinfo for the tarball
bstring = syaml.dump(buildinfo, default_flow_style=True).encode("utf-8")
- tarinfo = tarfile.TarInfo(name=_tarinfo_name(buildinfo_file_name(binaries_dir)))
+ tarinfo = tarfile.TarInfo(
+ name=spack.util.archive.default_path_to_name(buildinfo_file_name(binaries_dir))
+ )
tarinfo.type = tarfile.REGTYPE
tarinfo.size = len(bstring)
tarinfo.mode = 0o644
diff --git a/lib/spack/spack/fetch_strategy.py b/lib/spack/spack/fetch_strategy.py
index 864fcddcc3..93df10c98f 100644
--- a/lib/spack/spack/fetch_strategy.py
+++ b/lib/spack/spack/fetch_strategy.py
@@ -30,6 +30,7 @@ import re
import shutil
import urllib.error
import urllib.parse
+from pathlib import PurePath
from typing import List, Optional
import llnl.url
@@ -37,13 +38,14 @@ import llnl.util
import llnl.util.filesystem as fs
import llnl.util.tty as tty
from llnl.string import comma_and, quote
-from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, temp_rename, working_dir
+from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, working_dir
from llnl.util.symlink import symlink
import spack.config
import spack.error
import spack.oci.opener
import spack.url
+import spack.util.archive
import spack.util.crypto as crypto
import spack.util.git
import spack.util.url as url_util
@@ -600,29 +602,21 @@ class VCSFetchStrategy(FetchStrategy):
tty.debug("Source fetched with %s is already expanded." % self.url_attr)
@_needs_stage
- def archive(self, destination, **kwargs):
+ def archive(self, destination, *, exclude: Optional[str] = None):
assert llnl.url.extension_from_path(destination) == "tar.gz"
assert self.stage.source_path.startswith(self.stage.path)
-
- tar = which("tar", required=True)
-
- patterns = kwargs.get("exclude", None)
- if patterns is not None:
- if isinstance(patterns, str):
- patterns = [patterns]
- for p in patterns:
- tar.add_default_arg("--exclude=%s" % p)
-
- with working_dir(self.stage.path):
- if self.stage.srcdir:
- # Here we create an archive with the default repository name.
- # The 'tar' command has options for changing the name of a
- # directory that is included in the archive, but they differ
- # based on OS, so we temporarily rename the repo
- with temp_rename(self.stage.source_path, self.stage.srcdir):
- tar("-czf", destination, self.stage.srcdir)
- else:
- tar("-czf", destination, os.path.basename(self.stage.source_path))
+ # We need to prepend this dir name to every entry of the tarfile
+ top_level_dir = PurePath(self.stage.srcdir or os.path.basename(self.stage.source_path))
+
+ with working_dir(self.stage.source_path), spack.util.archive.gzip_compressed_tarfile(
+ destination
+ ) as (tar, _, _):
+ spack.util.archive.reproducible_tarfile_from_prefix(
+ tar=tar,
+ prefix=".",
+ skip=lambda entry: entry.name == exclude,
+ path_to_name=lambda path: (top_level_dir / PurePath(path)).as_posix(),
+ )
def __str__(self):
return "VCS: %s" % self.url
diff --git a/lib/spack/spack/test/oci/integration_test.py b/lib/spack/spack/test/oci/integration_test.py
index a2b1ac2f4f..8129dd22cf 100644
--- a/lib/spack/spack/test/oci/integration_test.py
+++ b/lib/spack/spack/test/oci/integration_test.py
@@ -13,11 +13,11 @@ from contextlib import contextmanager
import spack.environment as ev
import spack.oci.opener
-from spack.binary_distribution import gzip_compressed_tarfile
from spack.main import SpackCommand
from spack.oci.image import Digest, ImageReference, default_config, default_manifest
from spack.oci.oci import blob_exists, get_manifest_and_config, upload_blob, upload_manifest
from spack.test.oci.mock_registry import DummyServer, InMemoryOCIRegistry, create_opener
+from spack.util.archive import gzip_compressed_tarfile
buildcache = SpackCommand("buildcache")
mirror = SpackCommand("mirror")
diff --git a/lib/spack/spack/test/util/archive.py b/lib/spack/spack/test/util/archive.py
new file mode 100644
index 0000000000..9688f25924
--- /dev/null
+++ b/lib/spack/spack/test/util/archive.py
@@ -0,0 +1,157 @@
+# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+import gzip
+import hashlib
+import os
+import shutil
+import tarfile
+from pathlib import Path, PurePath
+
+import spack.util.crypto
+from spack.util.archive import gzip_compressed_tarfile, reproducible_tarfile_from_prefix
+
+
+def test_gzip_compressed_tarball_is_reproducible(tmpdir):
+ """Test gzip_compressed_tarfile and reproducible_tarfile_from_prefix for reproducibility"""
+
+ with tmpdir.as_cwd():
+ # Create a few directories
+ root = Path("root")
+ dir_a = root / "a"
+ dir_b = root / "b"
+ root.mkdir(mode=0o777)
+ dir_a.mkdir(mode=0o777)
+ dir_b.mkdir(mode=0o777)
+
+ (root / "y").touch()
+ (root / "x").touch()
+
+ (dir_a / "executable").touch(mode=0o777)
+ (dir_a / "data").touch(mode=0o666)
+ (dir_a / "symlink_file").symlink_to("data")
+ (dir_a / "symlink_dir").symlink_to(PurePath("..", "b"))
+ try:
+ os.link(dir_a / "executable", dir_a / "hardlink")
+ hardlink_support = True
+ except OSError:
+ hardlink_support = False
+
+ (dir_b / "executable").touch(mode=0o777)
+ (dir_b / "data").touch(mode=0o666)
+ (dir_b / "symlink_file").symlink_to("data")
+ (dir_b / "symlink_dir").symlink_to(PurePath("..", "a"))
+
+ # Create the first tarball
+ with gzip_compressed_tarfile("fst.tar.gz") as (tar, gzip_checksum_1, tarfile_checksum_1):
+ reproducible_tarfile_from_prefix(tar, "root")
+
+ # Expected mode for non-dirs is 644 if not executable, 755 if executable. Better to compute
+ # that as we don't know the umask of the user running the test.
+ expected_mode = (
+ lambda name: 0o755 if Path(*name.split("/")).lstat().st_mode & 0o100 else 0o644
+ )
+
+ # Verify the tarball contents
+ with tarfile.open("fst.tar.gz", "r:gz") as tar:
+ # Directories (mode is always 755)
+ for dir in ("root", "root/a", "root/b"):
+ m = tar.getmember(dir)
+ assert m.isdir()
+ assert m.mode == 0o755
+ assert m.uid == m.gid == 0
+ assert m.uname == m.gname == ""
+
+ # Non-executable regular files
+ for file in (
+ "root/x",
+ "root/y",
+ "root/a/data",
+ "root/b/data",
+ "root/a/executable",
+ "root/b/executable",
+ ):
+ m = tar.getmember(file)
+ assert m.isreg()
+ assert m.mode == expected_mode(file)
+ assert m.uid == m.gid == 0
+ assert m.uname == m.gname == ""
+
+ # Symlinks
+ for file in (
+ "root/a/symlink_file",
+ "root/a/symlink_dir",
+ "root/b/symlink_file",
+ "root/b/symlink_dir",
+ ):
+ m = tar.getmember(file)
+ assert m.issym()
+ assert m.mode == 0o755
+ assert m.uid == m.gid == m.mtime == 0
+ assert m.uname == m.gname == ""
+
+ # Verify the symlink targets. Notice that symlink targets are copied verbatim. That
+ # means the value is platform specific for relative symlinks within the current prefix,
+        # as on Windows they'd be ..\a and ..\b instead of ../a and ../b. So, reproducibility
+ # is only guaranteed per-platform currently.
+ assert PurePath(tar.getmember("root/a/symlink_file").linkname) == PurePath("data")
+ assert PurePath(tar.getmember("root/b/symlink_file").linkname) == PurePath("data")
+ assert PurePath(tar.getmember("root/a/symlink_dir").linkname) == PurePath("..", "b")
+ assert PurePath(tar.getmember("root/b/symlink_dir").linkname) == PurePath("..", "a")
+
+ # Check hardlink if supported
+ if hardlink_support:
+ m = tar.getmember("root/a/hardlink")
+ assert m.islnk()
+ assert m.mode == expected_mode("root/a/hardlink")
+ assert m.uid == m.gid == 0
+ assert m.uname == m.gname == ""
+ # Hardlink targets are always in posix format, as they reference a file that exists
+ # in the tarball.
+ assert m.linkname == "root/a/executable"
+
+ # Finally verify if entries are ordered by (is_dir, name)
+ assert [t.name for t in tar.getmembers()] == [
+ "root",
+ "root/x",
+ "root/y",
+ "root/a",
+ "root/a/data",
+ "root/a/executable",
+ *(["root/a/hardlink"] if hardlink_support else []),
+ "root/a/symlink_dir",
+ "root/a/symlink_file",
+ "root/b",
+ "root/b/data",
+ "root/b/executable",
+ "root/b/symlink_dir",
+ "root/b/symlink_file",
+ ]
+
+ # Delete the current root dir, extract the first tarball, create a second
+ shutil.rmtree(root)
+ with tarfile.open("fst.tar.gz", "r:gz") as tar:
+ tar.extractall()
+
+ # Create the second tarball
+ with gzip_compressed_tarfile("snd.tar.gz") as (tar, gzip_checksum_2, tarfile_checksum_2):
+ reproducible_tarfile_from_prefix(tar, "root")
+
+ # Verify the .tar.gz checksums are identical and correct
+ assert (
+ gzip_checksum_1.hexdigest()
+ == gzip_checksum_2.hexdigest()
+ == spack.util.crypto.checksum(hashlib.sha256, "fst.tar.gz")
+ == spack.util.crypto.checksum(hashlib.sha256, "snd.tar.gz")
+ )
+
+ # Verify the .tar checksums are identical and correct
+ with gzip.open("fst.tar.gz", "rb") as f, gzip.open("snd.tar.gz", "rb") as g:
+ assert (
+ tarfile_checksum_1.hexdigest()
+ == tarfile_checksum_2.hexdigest()
+ == spack.util.crypto.checksum_stream(hashlib.sha256, f)
+ == spack.util.crypto.checksum_stream(hashlib.sha256, g)
+ )
diff --git a/lib/spack/spack/util/archive.py b/lib/spack/spack/util/archive.py
new file mode 100644
index 0000000000..8bde40017c
--- /dev/null
+++ b/lib/spack/spack/util/archive.py
@@ -0,0 +1,228 @@
+# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+import errno
+import hashlib
+import io
+import os
+import pathlib
+import tarfile
+from contextlib import closing, contextmanager
+from gzip import GzipFile
+from typing import Callable, Dict, Tuple
+
+
+class ChecksumWriter(io.BufferedIOBase):
+ """Checksum writer computes a checksum while writing to a file."""
+
+ myfileobj = None
+
+ def __init__(self, fileobj, algorithm=hashlib.sha256):
+ self.fileobj = fileobj
+ self.hasher = algorithm()
+ self.length = 0
+
+ def hexdigest(self):
+ return self.hasher.hexdigest()
+
+ def write(self, data):
+ if isinstance(data, (bytes, bytearray)):
+ length = len(data)
+ else:
+ data = memoryview(data)
+ length = data.nbytes
+
+ if length > 0:
+ self.fileobj.write(data)
+ self.hasher.update(data)
+
+ self.length += length
+
+ return length
+
+ def read(self, size=-1):
+ raise OSError(errno.EBADF, "read() on write-only object")
+
+ def read1(self, size=-1):
+ raise OSError(errno.EBADF, "read1() on write-only object")
+
+ def peek(self, n):
+ raise OSError(errno.EBADF, "peek() on write-only object")
+
+ @property
+ def closed(self):
+ return self.fileobj is None
+
+ def close(self):
+ fileobj = self.fileobj
+ if fileobj is None:
+ return
+ self.fileobj.close()
+ self.fileobj = None
+
+ def flush(self):
+ self.fileobj.flush()
+
+ def fileno(self):
+ return self.fileobj.fileno()
+
+ def rewind(self):
+ raise OSError("Can't rewind while computing checksum")
+
+ def readable(self):
+ return False
+
+ def writable(self):
+ return True
+
+ def seekable(self):
+ return True
+
+ def tell(self):
+ return self.fileobj.tell()
+
+ def seek(self, offset, whence=io.SEEK_SET):
+ # In principle forward seek is possible with b"0" padding,
+ # but this is not implemented.
+ if offset == 0 and whence == io.SEEK_CUR:
+ return
+ raise OSError("Can't seek while computing checksum")
+
+ def readline(self, size=-1):
+ raise OSError(errno.EBADF, "readline() on write-only object")
+
+
+@contextmanager
+def gzip_compressed_tarfile(path):
+ """Create a reproducible, gzip compressed tarfile, and keep track of shasums of both the
+    compressed and uncompressed tarfile. Reproducibility is achieved by normalizing the gzip header
+ (no file name and zero mtime).
+
+ Yields a tuple of the following:
+ tarfile.TarFile: tarfile object
+ ChecksumWriter: checksum of the gzip compressed tarfile
+ ChecksumWriter: checksum of the uncompressed tarfile
+ """
+ # Create gzip compressed tarball of the install prefix
+ # 1) Use explicit empty filename and mtime 0 for gzip header reproducibility.
+ # If the filename="" is dropped, Python will use fileobj.name instead.
+    #    This should effectively mimic `gzip --no-name`.
+ # 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
+ # compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
+ # compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
+ # So we follow gzip.
+ with open(path, "wb") as f, ChecksumWriter(f) as gzip_checksum, closing(
+ GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=gzip_checksum)
+ ) as gzip_file, ChecksumWriter(gzip_file) as tarfile_checksum, tarfile.TarFile(
+ name="", mode="w", fileobj=tarfile_checksum
+ ) as tar:
+ yield tar, gzip_checksum, tarfile_checksum
+
+
+def default_path_to_name(path: str) -> str:
+ """Converts a path to a tarfile name, which uses posix path separators."""
+ p = pathlib.PurePath(path)
+ # Drop the leading slash on posix and the drive letter on windows, and always format as a
+ # posix path.
+ return pathlib.PurePath(*p.parts[1:]).as_posix() if p.is_absolute() else p.as_posix()
+
+
+def reproducible_tarfile_from_prefix(
+ tar: tarfile.TarFile,
+ prefix: str,
+ *,
+ include_parent_directories: bool = False,
+ skip: Callable[[os.DirEntry], bool] = lambda entry: False,
+ path_to_name: Callable[[str], str] = default_path_to_name,
+) -> None:
+ """Create a tarball from a given directory. Only adds regular files, symlinks and dirs.
+ Skips devices, fifos. Preserves hardlinks. Normalizes permissions like git. Tar entries are
+ added in depth-first pre-order, with dir entries partitioned by file | dir, and sorted
+ lexicographically, for reproducibility. Partitioning ensures only one dir is in memory at a
+ time, and sorting improves compression.
+
+ Args:
+ tar: tarfile object opened in write mode
+ prefix: path to directory to tar (either absolute or relative)
+ include_parent_directories: whether to include every directory leading up to ``prefix`` in
+ the tarball
+ skip: function that receives a DirEntry and returns True if the entry should be skipped,
+ whether it is a file or directory. Default implementation does not skip anything.
+ path_to_name: function that converts a path string to a tarfile entry name, which should be
+ in posix format. Not only is it necessary to transform paths in certain cases, such as
+ windows path to posix format, but it can also be used to prepend a directory to each
+ entry even if it does not exist on the filesystem. The default implementation drops the
+ leading slash on posix and the drive letter on windows for absolute paths, and formats
+            as a posix path."""
+
+ hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict()
+
+ if include_parent_directories:
+ parent_dirs = reversed(pathlib.Path(prefix).parents)
+ next(parent_dirs) # skip the root: slices are supported from python 3.10
+ for parent_dir in parent_dirs:
+ dir_info = tarfile.TarInfo(path_to_name(str(parent_dir)))
+ dir_info.type = tarfile.DIRTYPE
+ dir_info.mode = 0o755
+ tar.addfile(dir_info)
+
+ dir_stack = [prefix]
+ while dir_stack:
+ dir = dir_stack.pop()
+
+ # Add the dir before its contents
+ dir_info = tarfile.TarInfo(path_to_name(dir))
+ dir_info.type = tarfile.DIRTYPE
+ dir_info.mode = 0o755
+ tar.addfile(dir_info)
+
+ # Sort by name: reproducible & improves compression
+ with os.scandir(dir) as it:
+ entries = sorted(it, key=lambda entry: entry.name)
+
+ new_dirs = []
+ for entry in entries:
+ if skip(entry):
+ continue
+
+ if entry.is_dir(follow_symlinks=False):
+ new_dirs.append(entry.path)
+ continue
+
+ file_info = tarfile.TarInfo(path_to_name(entry.path))
+
+ if entry.is_symlink():
+ file_info.type = tarfile.SYMTYPE
+ file_info.linkname = os.readlink(entry.path)
+ # According to POSIX: "the value of the file mode bits returned in the
+ # st_mode field of the stat structure is unspecified." So we set it to
+ # something sensible without lstat'ing the link.
+ file_info.mode = 0o755
+ tar.addfile(file_info)
+
+ elif entry.is_file(follow_symlinks=False):
+ # entry.stat has zero (st_ino, st_dev, st_nlink) on Windows: use lstat.
+ s = os.lstat(entry.path)
+
+ # Normalize permissions like git
+ file_info.mode = 0o755 if s.st_mode & 0o100 else 0o644
+
+ # Deduplicate hardlinks
+ if s.st_nlink > 1:
+ ident = (s.st_dev, s.st_ino)
+ if ident in hardlink_to_tarinfo_name:
+ file_info.type = tarfile.LNKTYPE
+ file_info.linkname = hardlink_to_tarinfo_name[ident]
+ tar.addfile(file_info)
+ continue
+ hardlink_to_tarinfo_name[ident] = file_info.name
+
+ # If file not yet seen, copy it
+ file_info.type = tarfile.REGTYPE
+ file_info.size = s.st_size
+
+ with open(entry.path, "rb") as f:
+ tar.addfile(file_info, f)
+
+ dir_stack.extend(reversed(new_dirs)) # we pop, so reverse to stay alphabetical