From 22d4e7903719b8f4039ea1875e15bd0bf3fe550b Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Wed, 8 Mar 2023 16:51:55 +0100 Subject: buildcache create: reproducible tarballs (#35623) Currently `spack buildcache create` creates compressed tarballs that differ between each invocation, thanks to: 1. The gzip header containing mtime set to time.time() 2. The generated buildinfo file which has a different mtime every time. To avoid this, you have to explicitly construct GZipFile yourself, since the Python API doesn't expose the mtime arg, and we have to manually create the tarinfo object for the buildinfo metadata file. Normalize mode: regular files & hardlinks executable by user, dirs, symlinks: set 0o755 permissions in tarfile; other files use 0o644 --- lib/spack/spack/binary_distribution.py | 147 ++++++++++++++++++++++----------- lib/spack/spack/test/bindist.py | 80 ++++++++++++++++++ 2 files changed, 177 insertions(+), 50 deletions(-) diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py index 9758e8b9ea..38a3333475 100644 --- a/lib/spack/spack/binary_distribution.py +++ b/lib/spack/spack/binary_distribution.py @@ -6,6 +6,8 @@ import codecs import collections import hashlib +import io +import itertools import json import multiprocessing.pool import os @@ -20,7 +22,8 @@ import urllib.error import urllib.parse import urllib.request import warnings -from contextlib import closing +from contextlib import closing, contextmanager +from gzip import GzipFile from urllib.error import HTTPError, URLError import ruamel.yaml as yaml @@ -739,34 +742,31 @@ def get_buildfile_manifest(spec): return data -def write_buildinfo_file(spec, workdir, rel=False): - """ - Create a cache file containing information - required for the relocation - """ - manifest = get_buildfile_manifest(spec) +def prefixes_to_hashes(spec): + return { + str(s.prefix): s.dag_hash() + for s in itertools.chain( + spec.traverse(root=True, deptype="link"), spec.dependencies(deptype="run") + ) + } - prefix_to_hash = dict() - prefix_to_hash[str(spec.package.prefix)] = spec.dag_hash() - deps = spack.build_environment.get_rpath_deps(spec.package) - for d in deps + spec.dependencies(deptype="run"): - prefix_to_hash[str(d.prefix)] = d.dag_hash() - # Create buildinfo data and write it to disk - buildinfo = {} - buildinfo["sbang_install_path"] = spack.hooks.sbang.sbang_install_path() - buildinfo["relative_rpaths"] = rel - buildinfo["buildpath"] = spack.store.layout.root - buildinfo["spackprefix"] = spack.paths.prefix - buildinfo["relative_prefix"] = os.path.relpath(spec.prefix, spack.store.layout.root) - buildinfo["relocate_textfiles"] = manifest["text_to_relocate"] - buildinfo["relocate_binaries"] = manifest["binary_to_relocate"] - buildinfo["relocate_links"] = manifest["link_to_relocate"] - buildinfo["hardlinks_deduped"] = manifest["hardlinks_deduped"] - buildinfo["prefix_to_hash"] = prefix_to_hash - filename = buildinfo_file_name(workdir) - with open(filename, "w") as outfile: - outfile.write(syaml.dump(buildinfo, default_flow_style=True)) +def get_buildinfo_dict(spec, rel=False): + """Create metadata for a tarball""" + manifest = get_buildfile_manifest(spec) + + return { + "sbang_install_path": spack.hooks.sbang.sbang_install_path(), + "relative_rpaths": rel, + "buildpath": spack.store.layout.root, + "spackprefix": spack.paths.prefix, + "relative_prefix": os.path.relpath(spec.prefix, spack.store.layout.root), + "relocate_textfiles": manifest["text_to_relocate"], + "relocate_binaries": manifest["binary_to_relocate"], + "relocate_links": manifest["link_to_relocate"], + "hardlinks_deduped": manifest["hardlinks_deduped"], + "prefix_to_hash": prefixes_to_hashes(spec), + } def tarball_directory_name(spec): @@ -1139,6 +1139,68 @@ def generate_key_index(key_prefix, tmpdir=None): shutil.rmtree(tmpdir) +@contextmanager +def gzip_compressed_tarfile(path): + """Create a reproducible, compressed tarfile""" + # Create gzip compressed tarball of the install prefix + # 1) Use explicit empty filename and mtime 0 for gzip header reproducibility. + # If the filename="" is dropped, Python will use fileobj.name instead. + # This should effectively mimick `gzip --no-name`. + # 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed: + # compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB + # compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB + # So we follow gzip. + with open(path, "wb") as fileobj, closing( + GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=fileobj) + ) as gzip_file, tarfile.TarFile(name="", mode="w", fileobj=gzip_file) as tar: + yield tar + + +def deterministic_tarinfo(tarinfo: tarfile.TarInfo): + # We only add files, symlinks, hardlinks, and directories + # No character devices, block devices and FIFOs should ever enter a tarball. + if tarinfo.isdev(): + return None + + # For distribution, it makes no sense to user/group data; since (a) they don't exist + # on other machines, and (b) they lead to surprises as `tar x` run as root will change + # ownership if it can. We want to extract as the current user. By setting owner to root, + # root will extract as root, and non-privileged user will extract as themselves. + tarinfo.uid = 0 + tarinfo.gid = 0 + tarinfo.uname = "" + tarinfo.gname = "" + + # Reset mtime to epoch time, our prefixes are not truly immutable, so files may get + # touched; as long as the content does not change, this ensures we get stable tarballs. + tarinfo.mtime = 0 + + # Normalize mode + if tarinfo.isfile() or tarinfo.islnk(): + # If user can execute, use 0o755; else 0o644 + # This is to avoid potentially unsafe world writable & exeutable files that may get + # extracted when Python or tar is run with privileges + tarinfo.mode = 0o644 if tarinfo.mode & 0o100 == 0 else 0o755 + else: # symbolic link and directories + tarinfo.mode = 0o755 + + return tarinfo + + +def tar_add_metadata(tar: tarfile.TarFile, path: str, data: dict): + # Serialize buildinfo for the tarball + bstring = syaml.dump(data, default_flow_style=True).encode("utf-8") + tarinfo = tarfile.TarInfo(name=path) + tarinfo.size = len(bstring) + tar.addfile(deterministic_tarinfo(tarinfo), io.BytesIO(bstring)) + + +def _do_create_tarball(tarfile_path, binaries_dir, pkg_dir, buildinfo): + with gzip_compressed_tarfile(tarfile_path) as tar: + tar.add(name=binaries_dir, arcname=pkg_dir, filter=deterministic_tarinfo) + tar_add_metadata(tar, buildinfo_file_name(pkg_dir), buildinfo) + + def _build_tarball( spec, out_url, @@ -1217,39 +1279,26 @@ def _build_tarball( os.remove(temp_tarfile_path) else: binaries_dir = spec.prefix - mkdirp(os.path.join(workdir, ".spack")) # create info for later relocation and create tar - write_buildinfo_file(spec, workdir, relative) + buildinfo = get_buildinfo_dict(spec, relative) # optionally make the paths in the binaries relative to each other # in the spack install tree before creating tarball try: if relative: - make_package_relative(workdir, spec, allow_root) + make_package_relative(workdir, spec, buildinfo, allow_root) elif not allow_root: - ensure_package_relocatable(workdir, binaries_dir) + ensure_package_relocatable(buildinfo, binaries_dir) except Exception as e: - shutil.rmtree(workdir) - shutil.rmtree(tarfile_dir) shutil.rmtree(tmpdir) tty.die(e) - # create gzip compressed tarball of the install prefix - # On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed: - # compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB - # compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB - # So we follow gzip. - with closing(tarfile.open(tarfile_path, "w:gz", compresslevel=6)) as tar: - tar.add(name=binaries_dir, arcname=pkg_dir) - if not relative: - # Add buildinfo file - buildinfo_path = buildinfo_file_name(workdir) - buildinfo_arcname = buildinfo_file_name(pkg_dir) - tar.add(name=buildinfo_path, arcname=buildinfo_arcname) + _do_create_tarball(tarfile_path, binaries_dir, pkg_dir, buildinfo) # remove copy of install directory - shutil.rmtree(workdir) + if relative: + shutil.rmtree(workdir) # get the sha256 checksum of the tarball checksum = checksum_tarball(tarfile_path) @@ -1536,13 +1585,12 @@ def download_tarball(spec, unsigned=False, mirrors_for_spec=None): return None -def make_package_relative(workdir, spec, allow_root): +def make_package_relative(workdir, spec, buildinfo, allow_root): """ Change paths in binaries to relative paths. Change absolute symlinks to relative symlinks. """ prefix = spec.prefix - buildinfo = read_buildinfo_file(workdir) old_layout_root = buildinfo["buildpath"] orig_path_names = list() cur_path_names = list() @@ -1566,9 +1614,8 @@ def make_package_relative(workdir, spec, allow_root): relocate.make_link_relative(cur_path_names, orig_path_names) -def ensure_package_relocatable(workdir, binaries_dir): +def ensure_package_relocatable(buildinfo, binaries_dir): """Check if package binaries are relocatable.""" - buildinfo = read_buildinfo_file(workdir) binaries = [os.path.join(binaries_dir, f) for f in buildinfo["relocate_binaries"]] relocate.ensure_binaries_are_relocatable(binaries) diff --git a/lib/spack/spack/test/bindist.py b/lib/spack/spack/test/bindist.py index 81f5f7377c..cde16c241e 100644 --- a/lib/spack/spack/test/bindist.py +++ b/lib/spack/spack/test/bindist.py @@ -2,11 +2,13 @@ # Spack Project Developers. See the top-level COPYRIGHT file for details. # # SPDX-License-Identifier: (Apache-2.0 OR MIT) +import filecmp import glob import io import os import platform import sys +import tarfile import urllib.error import urllib.request import urllib.response @@ -952,3 +954,81 @@ def test_correct_specs_are_pushed( bindist.push([spec], push_url, include_root=root, include_dependencies=deps) assert packages_to_push == expected + + +def test_reproducible_tarball_is_reproducible(tmpdir): + p = tmpdir.mkdir("prefix") + p.mkdir("bin") + p.mkdir(".spack") + + app = p.join("bin", "app") + + tarball_1 = str(tmpdir.join("prefix-1.tar.gz")) + tarball_2 = str(tmpdir.join("prefix-2.tar.gz")) + + with open(app, "w") as f: + f.write("hello world") + + buildinfo = {"metadata": "yes please"} + + # Create a tarball with a certain mtime of bin/app + os.utime(app, times=(0, 0)) + bindist._do_create_tarball(tarball_1, binaries_dir=p, pkg_dir="pkg", buildinfo=buildinfo) + + # Do it another time with different mtime of bin/app + os.utime(app, times=(10, 10)) + bindist._do_create_tarball(tarball_2, binaries_dir=p, pkg_dir="pkg", buildinfo=buildinfo) + + # They should be bitwise identical: + assert filecmp.cmp(tarball_1, tarball_2, shallow=False) + + # Sanity check for contents: + with tarfile.open(tarball_1, mode="r") as f: + for m in f.getmembers(): + assert m.uid == m.gid == m.mtime == 0 + assert m.uname == m.gname == "" + + assert set(f.getnames()) == { + "pkg", + "pkg/bin", + "pkg/bin/app", + "pkg/.spack", + "pkg/.spack/binary_distribution", + } + + +def test_tarball_normalized_permissions(tmpdir): + p = tmpdir.mkdir("prefix") + p.mkdir("bin") + p.mkdir("share") + p.mkdir(".spack") + + app = p.join("bin", "app") + data = p.join("share", "file") + tarball = str(tmpdir.join("prefix.tar.gz")) + + # Everyone can write & execute. This should turn into 0o755 when the tarball is + # extracted (on a different system). + with open(app, "w", opener=lambda path, flags: os.open(path, flags, 0o777)) as f: + f.write("hello world") + + # User doesn't have execute permissions, but group/world have; this should also + # turn into 0o644 (user read/write, group&world only read). + with open(data, "w", opener=lambda path, flags: os.open(path, flags, 0o477)) as f: + f.write("hello world") + + bindist._do_create_tarball(tarball, binaries_dir=p, pkg_dir="pkg", buildinfo={}) + + with tarfile.open(tarball) as tar: + path_to_member = {member.name: member for member in tar.getmembers()} + + # directories should have 0o755 + assert path_to_member["pkg"].mode == 0o755 + assert path_to_member["pkg/bin"].mode == 0o755 + assert path_to_member["pkg/.spack"].mode == 0o755 + + # executable-by-user files should be 0o755 + assert path_to_member["pkg/bin/app"].mode == 0o755 + + # not-executable-by-user files should be 0o644 + assert path_to_member["pkg/share/file"].mode == 0o644 -- cgit v1.2.3-60-g2f50