diff options
author | Harmen Stoppels <harmenstoppels@gmail.com> | 2023-10-03 09:56:18 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-03 09:56:18 +0200 |
commit | 06057d6dbac34a1f775f6cab1ab88ee8e145087f (patch) | |
tree | bf3aa5e04236e87bbfc78df210c12a3db76f8f8e /lib | |
parent | bb03a1768b317607204ae05e66485ce7f7398a6f (diff) | |
download | spack-06057d6dbac34a1f775f6cab1ab88ee8e145087f.tar.gz spack-06057d6dbac34a1f775f6cab1ab88ee8e145087f.tar.bz2 spack-06057d6dbac34a1f775f6cab1ab88ee8e145087f.tar.xz spack-06057d6dbac34a1f775f6cab1ab88ee8e145087f.zip |
Buildcache tarballs with rootfs structure (#39341)
Two changes in this PR:
1. Register absolute paths in tarballs, which makes it easier
to use them as container image layers, or rootfs in general, outside
of Spack. Spack supports this already on develop.
2. Assemble the tarfile entries "by hand", which has a few advantages:
1. Avoid reading `/etc/passwd`, `/etc/groups`, `/etc/nsswitch.conf`
which `tar.add(dir)` does _for each file it adds_
2. Reduce the number of stat calls per file added by a factor two,
compared to `tar.add`, which should help with slow, shared filesystems
where these calls are expensive
4. Create normalized `TarInfo` entries from the start, instead of letting
Python create them and patching them after the fact
5. Don't recurse into subdirs before processing files, to avoid
keeping nested directories opened. (this changes the tar entry
order slightly, it's like sorting by `(not is_dir, name)`.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/spack/spack/binary_distribution.py | 143 | ||||
-rw-r--r-- | lib/spack/spack/test/bindist.py | 92 |
2 files changed, 162 insertions, 73 deletions
diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py index 90cd577fe8..a9b1d6280b 100644 --- a/lib/spack/spack/binary_distribution.py +++ b/lib/spack/spack/binary_distribution.py @@ -23,7 +23,7 @@ import urllib.request import warnings from contextlib import closing, contextmanager from gzip import GzipFile -from typing import List, NamedTuple, Optional, Union +from typing import Dict, List, NamedTuple, Optional, Tuple, Union from urllib.error import HTTPError, URLError import llnl.util.filesystem as fsys @@ -625,8 +625,7 @@ def buildinfo_file_name(prefix): """ Filename of the binary package meta-data file """ - name = os.path.join(prefix, ".spack/binary_distribution") - return name + return os.path.join(prefix, ".spack/binary_distribution") def read_buildinfo_file(prefix): @@ -1158,57 +1157,99 @@ def gzip_compressed_tarfile(path): yield tar -def deterministic_tarinfo(tarinfo: tarfile.TarInfo): - # We only add files, symlinks, hardlinks, and directories - # No character devices, block devices and FIFOs should ever enter a tarball. - if tarinfo.isdev(): - return None +def _tarinfo_name(p: str): + return p.lstrip("/") - # For distribution, it makes no sense to user/group data; since (a) they don't exist - # on other machines, and (b) they lead to surprises as `tar x` run as root will change - # ownership if it can. We want to extract as the current user. By setting owner to root, - # root will extract as root, and non-privileged user will extract as themselves. - tarinfo.uid = 0 - tarinfo.gid = 0 - tarinfo.uname = "" - tarinfo.gname = "" - - # Reset mtime to epoch time, our prefixes are not truly immutable, so files may get - # touched; as long as the content does not change, this ensures we get stable tarballs. - tarinfo.mtime = 0 - - # Normalize mode - if tarinfo.isfile() or tarinfo.islnk(): - # If user can execute, use 0o755; else 0o644 - # This is to avoid potentially unsafe world writable & exeutable files that may get - # extracted when Python or tar is run with privileges - tarinfo.mode = 0o644 if tarinfo.mode & 0o100 == 0 else 0o755 - else: # symbolic link and directories - tarinfo.mode = 0o755 - - return tarinfo - - -def tar_add_metadata(tar: tarfile.TarFile, path: str, data: dict): - # Serialize buildinfo for the tarball - bstring = syaml.dump(data, default_flow_style=True).encode("utf-8") - tarinfo = tarfile.TarInfo(name=path) - tarinfo.size = len(bstring) - tar.addfile(deterministic_tarinfo(tarinfo), io.BytesIO(bstring)) - - -def deterministic_tarinfo_without_buildinfo(tarinfo: tarfile.TarInfo): - """Skip buildinfo file when creating a tarball, and normalize other tarinfo fields.""" - if tarinfo.name.endswith("/.spack/binary_distribution"): - return None - return deterministic_tarinfo(tarinfo) +def tarfile_of_spec_prefix(tar: tarfile.TarFile, prefix: str) -> None: + """Create a tarfile of an install prefix of a spec. Skips existing buildinfo file. + Only adds regular files, symlinks and dirs. Skips devices, fifos. Preserves hardlinks. + Normalizes permissions like git. Tar entries are added in depth-first pre-order, with + dir entries partitioned by file | dir, and sorted alphabetically, for reproducibility. + Partitioning ensures only one dir is in memory at a time, and sorting improves compression. + + Args: + tar: tarfile object to add files to + prefix: absolute install prefix of spec""" + if not os.path.isabs(prefix) or not os.path.isdir(prefix): + raise ValueError(f"prefix '{prefix}' must be an absolute path to a directory") + hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict() + stat_key = lambda stat: (stat.st_dev, stat.st_ino) + + try: # skip buildinfo file if it exists + files_to_skip = [stat_key(os.lstat(buildinfo_file_name(prefix)))] + except OSError: + files_to_skip = [] + + dir_stack = [prefix] + while dir_stack: + dir = dir_stack.pop() + + # Add the dir before its contents + dir_info = tarfile.TarInfo(_tarinfo_name(dir)) + dir_info.type = tarfile.DIRTYPE + dir_info.mode = 0o755 + tar.addfile(dir_info) + + # Sort by name: reproducible & improves compression + with os.scandir(dir) as it: + entries = sorted(it, key=lambda entry: entry.name) + + new_dirs = [] + for entry in entries: + if entry.is_dir(follow_symlinks=False): + new_dirs.append(entry.path) + continue + + file_info = tarfile.TarInfo(_tarinfo_name(entry.path)) + + s = entry.stat(follow_symlinks=False) + + # Skip existing binary distribution files. + id = stat_key(s) + if id in files_to_skip: + continue + + # Normalize the mode + file_info.mode = 0o644 if s.st_mode & 0o100 == 0 else 0o755 + + if entry.is_symlink(): + file_info.type = tarfile.SYMTYPE + file_info.linkname = os.readlink(entry.path) + tar.addfile(file_info) + + elif entry.is_file(follow_symlinks=False): + # Deduplicate hardlinks + if s.st_nlink > 1: + if id in hardlink_to_tarinfo_name: + file_info.type = tarfile.LNKTYPE + file_info.linkname = hardlink_to_tarinfo_name[id] + tar.addfile(file_info) + continue + hardlink_to_tarinfo_name[id] = file_info.name + + # If file not yet seen, copy it. + file_info.type = tarfile.REGTYPE + file_info.size = s.st_size + with open(entry.path, "rb") as f: + tar.addfile(file_info, f) -def _do_create_tarball(tarfile_path: str, binaries_dir: str, pkg_dir: str, buildinfo: dict): + dir_stack.extend(reversed(new_dirs)) # we pop, so reverse to stay alphabetical + + +def _do_create_tarball(tarfile_path: str, binaries_dir: str, buildinfo: dict): with gzip_compressed_tarfile(tarfile_path) as tar: - tar.add(name=binaries_dir, arcname=pkg_dir, filter=deterministic_tarinfo_without_buildinfo) - tar_add_metadata(tar, buildinfo_file_name(pkg_dir), buildinfo) + # Tarball the install prefix + tarfile_of_spec_prefix(tar, binaries_dir) + + # Serialize buildinfo for the tarball + bstring = syaml.dump(buildinfo, default_flow_style=True).encode("utf-8") + tarinfo = tarfile.TarInfo(name=_tarinfo_name(buildinfo_file_name(binaries_dir))) + tarinfo.type = tarfile.REGTYPE + tarinfo.size = len(bstring) + tarinfo.mode = 0o644 + tar.addfile(tarinfo, io.BytesIO(bstring)) class PushOptions(NamedTuple): @@ -1280,14 +1321,12 @@ def _build_tarball_in_stage_dir(spec: Spec, out_url: str, stage_dir: str, option ): raise NoOverwriteException(url_util.format(remote_specfile_path)) - pkg_dir = os.path.basename(spec.prefix.rstrip(os.path.sep)) - binaries_dir = spec.prefix # create info for later relocation and create tar buildinfo = get_buildinfo_dict(spec) - _do_create_tarball(tarfile_path, binaries_dir, pkg_dir, buildinfo) + _do_create_tarball(tarfile_path, binaries_dir, buildinfo) # get the sha256 checksum of the tarball checksum = checksum_tarball(tarfile_path) diff --git a/lib/spack/spack/test/bindist.py b/lib/spack/spack/test/bindist.py index ea065c26a5..20802bbdd8 100644 --- a/lib/spack/spack/test/bindist.py +++ b/lib/spack/spack/test/bindist.py @@ -899,22 +899,21 @@ def test_tarball_doesnt_include_buildinfo_twice(tmpdir): tarball = str(tmpdir.join("prefix.tar.gz")) bindist._do_create_tarball( - tarfile_path=tarball, - binaries_dir=str(p), - pkg_dir="my-pkg-prefix", - buildinfo={"metadata": "new"}, + tarfile_path=tarball, binaries_dir=p.strpath, buildinfo={"metadata": "new"} ) + expected_prefix = p.strpath.lstrip("/") + # Verify we don't have a repeated binary_distribution file, # and that the tarball contains the new one, not the old one. with tarfile.open(tarball) as tar: - assert syaml.load(tar.extractfile("my-pkg-prefix/.spack/binary_distribution")) == { + assert syaml.load(tar.extractfile(f"{expected_prefix}/.spack/binary_distribution")) == { "metadata": "new" } assert tar.getnames() == [ - "my-pkg-prefix", - "my-pkg-prefix/.spack", - "my-pkg-prefix/.spack/binary_distribution", + f"{expected_prefix}", + f"{expected_prefix}/.spack", + f"{expected_prefix}/.spack/binary_distribution", ] @@ -935,15 +934,17 @@ def test_reproducible_tarball_is_reproducible(tmpdir): # Create a tarball with a certain mtime of bin/app os.utime(app, times=(0, 0)) - bindist._do_create_tarball(tarball_1, binaries_dir=p, pkg_dir="pkg", buildinfo=buildinfo) + bindist._do_create_tarball(tarball_1, binaries_dir=p.strpath, buildinfo=buildinfo) # Do it another time with different mtime of bin/app os.utime(app, times=(10, 10)) - bindist._do_create_tarball(tarball_2, binaries_dir=p, pkg_dir="pkg", buildinfo=buildinfo) + bindist._do_create_tarball(tarball_2, binaries_dir=p.strpath, buildinfo=buildinfo) # They should be bitwise identical: assert filecmp.cmp(tarball_1, tarball_2, shallow=False) + expected_prefix = p.strpath.lstrip("/") + # Sanity check for contents: with tarfile.open(tarball_1, mode="r") as f: for m in f.getmembers(): @@ -951,11 +952,11 @@ def test_reproducible_tarball_is_reproducible(tmpdir): assert m.uname == m.gname == "" assert set(f.getnames()) == { - "pkg", - "pkg/bin", - "pkg/bin/app", - "pkg/.spack", - "pkg/.spack/binary_distribution", + f"{expected_prefix}", + f"{expected_prefix}/bin", + f"{expected_prefix}/bin/app", + f"{expected_prefix}/.spack", + f"{expected_prefix}/.spack/binary_distribution", } @@ -979,21 +980,23 @@ def test_tarball_normalized_permissions(tmpdir): with open(data, "w", opener=lambda path, flags: os.open(path, flags, 0o477)) as f: f.write("hello world") - bindist._do_create_tarball(tarball, binaries_dir=p, pkg_dir="pkg", buildinfo={}) + bindist._do_create_tarball(tarball, binaries_dir=p.strpath, buildinfo={}) + + expected_prefix = p.strpath.lstrip("/") with tarfile.open(tarball) as tar: path_to_member = {member.name: member for member in tar.getmembers()} # directories should have 0o755 - assert path_to_member["pkg"].mode == 0o755 - assert path_to_member["pkg/bin"].mode == 0o755 - assert path_to_member["pkg/.spack"].mode == 0o755 + assert path_to_member[f"{expected_prefix}"].mode == 0o755 + assert path_to_member[f"{expected_prefix}/bin"].mode == 0o755 + assert path_to_member[f"{expected_prefix}/.spack"].mode == 0o755 # executable-by-user files should be 0o755 - assert path_to_member["pkg/bin/app"].mode == 0o755 + assert path_to_member[f"{expected_prefix}/bin/app"].mode == 0o755 # not-executable-by-user files should be 0o644 - assert path_to_member["pkg/share/file"].mode == 0o644 + assert path_to_member[f"{expected_prefix}/share/file"].mode == 0o644 def test_tarball_common_prefix(dummy_prefix, tmpdir): @@ -1062,3 +1065,50 @@ def test_tarfile_with_files_outside_common_prefix(tmpdir, dummy_prefix): ValueError, match="Tarball contains file /etc/config_file outside of prefix" ): bindist._ensure_common_prefix(tarfile.open("broken.tar", mode="r")) + + +def test_tarfile_of_spec_prefix(tmpdir): + """Tests whether hardlinks, symlinks, files and dirs are added correctly, + and that the order of entries is correct.""" + prefix = tmpdir.mkdir("prefix") + prefix.ensure("a_directory", dir=True).join("file").write("hello") + prefix.ensure("c_directory", dir=True).join("file").write("hello") + prefix.ensure("b_directory", dir=True).join("file").write("hello") + prefix.join("file").write("hello") + os.symlink(prefix.join("file"), prefix.join("symlink")) + os.link(prefix.join("file"), prefix.join("hardlink")) + + file = tmpdir.join("example.tar") + + with tarfile.open(file, mode="w") as tar: + bindist.tarfile_of_spec_prefix(tar, prefix.strpath) + + expected_prefix = prefix.strpath.lstrip("/") + + with tarfile.open(file, mode="r") as tar: + # Verify that entries are added in depth-first pre-order, files preceding dirs, + # entries ordered alphabetically + assert tar.getnames() == [ + f"{expected_prefix}", + f"{expected_prefix}/file", + f"{expected_prefix}/hardlink", + f"{expected_prefix}/symlink", + f"{expected_prefix}/a_directory", + f"{expected_prefix}/a_directory/file", + f"{expected_prefix}/b_directory", + f"{expected_prefix}/b_directory/file", + f"{expected_prefix}/c_directory", + f"{expected_prefix}/c_directory/file", + ] + + # Check that the types are all correct + assert tar.getmember(f"{expected_prefix}").isdir() + assert tar.getmember(f"{expected_prefix}/file").isreg() + assert tar.getmember(f"{expected_prefix}/hardlink").islnk() + assert tar.getmember(f"{expected_prefix}/symlink").issym() + assert tar.getmember(f"{expected_prefix}/a_directory").isdir() + assert tar.getmember(f"{expected_prefix}/a_directory/file").isreg() + assert tar.getmember(f"{expected_prefix}/b_directory").isdir() + assert tar.getmember(f"{expected_prefix}/b_directory/file").isreg() + assert tar.getmember(f"{expected_prefix}/c_directory").isdir() + assert tar.getmember(f"{expected_prefix}/c_directory/file").isreg() |