summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorHarmen Stoppels <harmenstoppels@gmail.com>2023-03-08 16:51:55 +0100
committerGitHub <noreply@github.com>2023-03-08 15:51:55 +0000
commit22d4e7903719b8f4039ea1875e15bd0bf3fe550b (patch)
tree1a8e7cb138d408f9894e9d9aa6a103a53ac76bb2 /lib
parent2777ca83eb3e90ea8ef2f92208dd8dbf67cba404 (diff)
downloadspack-22d4e7903719b8f4039ea1875e15bd0bf3fe550b.tar.gz
spack-22d4e7903719b8f4039ea1875e15bd0bf3fe550b.tar.bz2
spack-22d4e7903719b8f4039ea1875e15bd0bf3fe550b.tar.xz
spack-22d4e7903719b8f4039ea1875e15bd0bf3fe550b.zip
buildcache create: reproducible tarballs (#35623)
Currently `spack buildcache create` creates compressed tarballs that differ between each invocation, thanks to: 1. The gzip header containing mtime set to time.time() 2. The generated buildinfo file which has a different mtime every time. To avoid this, you have to explicitly construct GZipFile yourself, since the Python API doesn't expose the mtime arg, and we have to manually create the tarinfo object for the buildinfo metadata file. Normalize mode: regular files & hardlinks executable by user, dirs, symlinks: set 0o755 permissions in tarfile; other files use 0o644
Diffstat (limited to 'lib')
-rw-r--r--lib/spack/spack/binary_distribution.py147
-rw-r--r--lib/spack/spack/test/bindist.py80
2 files changed, 177 insertions, 50 deletions
diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py
index 9758e8b9ea..38a3333475 100644
--- a/lib/spack/spack/binary_distribution.py
+++ b/lib/spack/spack/binary_distribution.py
@@ -6,6 +6,8 @@
import codecs
import collections
import hashlib
+import io
+import itertools
import json
import multiprocessing.pool
import os
@@ -20,7 +22,8 @@ import urllib.error
import urllib.parse
import urllib.request
import warnings
-from contextlib import closing
+from contextlib import closing, contextmanager
+from gzip import GzipFile
from urllib.error import HTTPError, URLError
import ruamel.yaml as yaml
@@ -739,34 +742,31 @@ def get_buildfile_manifest(spec):
return data
-def write_buildinfo_file(spec, workdir, rel=False):
- """
- Create a cache file containing information
- required for the relocation
- """
- manifest = get_buildfile_manifest(spec)
+def prefixes_to_hashes(spec):
+ return {
+ str(s.prefix): s.dag_hash()
+ for s in itertools.chain(
+ spec.traverse(root=True, deptype="link"), spec.dependencies(deptype="run")
+ )
+ }
- prefix_to_hash = dict()
- prefix_to_hash[str(spec.package.prefix)] = spec.dag_hash()
- deps = spack.build_environment.get_rpath_deps(spec.package)
- for d in deps + spec.dependencies(deptype="run"):
- prefix_to_hash[str(d.prefix)] = d.dag_hash()
- # Create buildinfo data and write it to disk
- buildinfo = {}
- buildinfo["sbang_install_path"] = spack.hooks.sbang.sbang_install_path()
- buildinfo["relative_rpaths"] = rel
- buildinfo["buildpath"] = spack.store.layout.root
- buildinfo["spackprefix"] = spack.paths.prefix
- buildinfo["relative_prefix"] = os.path.relpath(spec.prefix, spack.store.layout.root)
- buildinfo["relocate_textfiles"] = manifest["text_to_relocate"]
- buildinfo["relocate_binaries"] = manifest["binary_to_relocate"]
- buildinfo["relocate_links"] = manifest["link_to_relocate"]
- buildinfo["hardlinks_deduped"] = manifest["hardlinks_deduped"]
- buildinfo["prefix_to_hash"] = prefix_to_hash
- filename = buildinfo_file_name(workdir)
- with open(filename, "w") as outfile:
- outfile.write(syaml.dump(buildinfo, default_flow_style=True))
+def get_buildinfo_dict(spec, rel=False):
+ """Create metadata for a tarball"""
+ manifest = get_buildfile_manifest(spec)
+
+ return {
+ "sbang_install_path": spack.hooks.sbang.sbang_install_path(),
+ "relative_rpaths": rel,
+ "buildpath": spack.store.layout.root,
+ "spackprefix": spack.paths.prefix,
+ "relative_prefix": os.path.relpath(spec.prefix, spack.store.layout.root),
+ "relocate_textfiles": manifest["text_to_relocate"],
+ "relocate_binaries": manifest["binary_to_relocate"],
+ "relocate_links": manifest["link_to_relocate"],
+ "hardlinks_deduped": manifest["hardlinks_deduped"],
+ "prefix_to_hash": prefixes_to_hashes(spec),
+ }
def tarball_directory_name(spec):
@@ -1139,6 +1139,68 @@ def generate_key_index(key_prefix, tmpdir=None):
shutil.rmtree(tmpdir)
+@contextmanager
+def gzip_compressed_tarfile(path):
+ """Create a reproducible, compressed tarfile"""
+ # Create gzip compressed tarball of the install prefix
+ # 1) Use explicit empty filename and mtime 0 for gzip header reproducibility.
+ # If the filename="" is dropped, Python will use fileobj.name instead.
+ # This should effectively mimick `gzip --no-name`.
+ # 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
+ # compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
+ # compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
+ # So we follow gzip.
+ with open(path, "wb") as fileobj, closing(
+ GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=fileobj)
+ ) as gzip_file, tarfile.TarFile(name="", mode="w", fileobj=gzip_file) as tar:
+ yield tar
+
+
+def deterministic_tarinfo(tarinfo: tarfile.TarInfo):
+ # We only add files, symlinks, hardlinks, and directories
+ # No character devices, block devices and FIFOs should ever enter a tarball.
+ if tarinfo.isdev():
+ return None
+
+ # For distribution, it makes no sense to user/group data; since (a) they don't exist
+ # on other machines, and (b) they lead to surprises as `tar x` run as root will change
+ # ownership if it can. We want to extract as the current user. By setting owner to root,
+ # root will extract as root, and non-privileged user will extract as themselves.
+ tarinfo.uid = 0
+ tarinfo.gid = 0
+ tarinfo.uname = ""
+ tarinfo.gname = ""
+
+ # Reset mtime to epoch time, our prefixes are not truly immutable, so files may get
+ # touched; as long as the content does not change, this ensures we get stable tarballs.
+ tarinfo.mtime = 0
+
+ # Normalize mode
+ if tarinfo.isfile() or tarinfo.islnk():
+ # If user can execute, use 0o755; else 0o644
+ # This is to avoid potentially unsafe world writable & exeutable files that may get
+ # extracted when Python or tar is run with privileges
+ tarinfo.mode = 0o644 if tarinfo.mode & 0o100 == 0 else 0o755
+ else: # symbolic link and directories
+ tarinfo.mode = 0o755
+
+ return tarinfo
+
+
+def tar_add_metadata(tar: tarfile.TarFile, path: str, data: dict):
+ # Serialize buildinfo for the tarball
+ bstring = syaml.dump(data, default_flow_style=True).encode("utf-8")
+ tarinfo = tarfile.TarInfo(name=path)
+ tarinfo.size = len(bstring)
+ tar.addfile(deterministic_tarinfo(tarinfo), io.BytesIO(bstring))
+
+
+def _do_create_tarball(tarfile_path, binaries_dir, pkg_dir, buildinfo):
+ with gzip_compressed_tarfile(tarfile_path) as tar:
+ tar.add(name=binaries_dir, arcname=pkg_dir, filter=deterministic_tarinfo)
+ tar_add_metadata(tar, buildinfo_file_name(pkg_dir), buildinfo)
+
+
def _build_tarball(
spec,
out_url,
@@ -1217,39 +1279,26 @@ def _build_tarball(
os.remove(temp_tarfile_path)
else:
binaries_dir = spec.prefix
- mkdirp(os.path.join(workdir, ".spack"))
# create info for later relocation and create tar
- write_buildinfo_file(spec, workdir, relative)
+ buildinfo = get_buildinfo_dict(spec, relative)
# optionally make the paths in the binaries relative to each other
# in the spack install tree before creating tarball
try:
if relative:
- make_package_relative(workdir, spec, allow_root)
+ make_package_relative(workdir, spec, buildinfo, allow_root)
elif not allow_root:
- ensure_package_relocatable(workdir, binaries_dir)
+ ensure_package_relocatable(buildinfo, binaries_dir)
except Exception as e:
- shutil.rmtree(workdir)
- shutil.rmtree(tarfile_dir)
shutil.rmtree(tmpdir)
tty.die(e)
- # create gzip compressed tarball of the install prefix
- # On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
- # compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
- # compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
- # So we follow gzip.
- with closing(tarfile.open(tarfile_path, "w:gz", compresslevel=6)) as tar:
- tar.add(name=binaries_dir, arcname=pkg_dir)
- if not relative:
- # Add buildinfo file
- buildinfo_path = buildinfo_file_name(workdir)
- buildinfo_arcname = buildinfo_file_name(pkg_dir)
- tar.add(name=buildinfo_path, arcname=buildinfo_arcname)
+ _do_create_tarball(tarfile_path, binaries_dir, pkg_dir, buildinfo)
# remove copy of install directory
- shutil.rmtree(workdir)
+ if relative:
+ shutil.rmtree(workdir)
# get the sha256 checksum of the tarball
checksum = checksum_tarball(tarfile_path)
@@ -1536,13 +1585,12 @@ def download_tarball(spec, unsigned=False, mirrors_for_spec=None):
return None
-def make_package_relative(workdir, spec, allow_root):
+def make_package_relative(workdir, spec, buildinfo, allow_root):
"""
Change paths in binaries to relative paths. Change absolute symlinks
to relative symlinks.
"""
prefix = spec.prefix
- buildinfo = read_buildinfo_file(workdir)
old_layout_root = buildinfo["buildpath"]
orig_path_names = list()
cur_path_names = list()
@@ -1566,9 +1614,8 @@ def make_package_relative(workdir, spec, allow_root):
relocate.make_link_relative(cur_path_names, orig_path_names)
-def ensure_package_relocatable(workdir, binaries_dir):
+def ensure_package_relocatable(buildinfo, binaries_dir):
"""Check if package binaries are relocatable."""
- buildinfo = read_buildinfo_file(workdir)
binaries = [os.path.join(binaries_dir, f) for f in buildinfo["relocate_binaries"]]
relocate.ensure_binaries_are_relocatable(binaries)
diff --git a/lib/spack/spack/test/bindist.py b/lib/spack/spack/test/bindist.py
index 81f5f7377c..cde16c241e 100644
--- a/lib/spack/spack/test/bindist.py
+++ b/lib/spack/spack/test/bindist.py
@@ -2,11 +2,13 @@
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+import filecmp
import glob
import io
import os
import platform
import sys
+import tarfile
import urllib.error
import urllib.request
import urllib.response
@@ -952,3 +954,81 @@ def test_correct_specs_are_pushed(
bindist.push([spec], push_url, include_root=root, include_dependencies=deps)
assert packages_to_push == expected
+
+
+def test_reproducible_tarball_is_reproducible(tmpdir):
+ p = tmpdir.mkdir("prefix")
+ p.mkdir("bin")
+ p.mkdir(".spack")
+
+ app = p.join("bin", "app")
+
+ tarball_1 = str(tmpdir.join("prefix-1.tar.gz"))
+ tarball_2 = str(tmpdir.join("prefix-2.tar.gz"))
+
+ with open(app, "w") as f:
+ f.write("hello world")
+
+ buildinfo = {"metadata": "yes please"}
+
+ # Create a tarball with a certain mtime of bin/app
+ os.utime(app, times=(0, 0))
+ bindist._do_create_tarball(tarball_1, binaries_dir=p, pkg_dir="pkg", buildinfo=buildinfo)
+
+ # Do it another time with different mtime of bin/app
+ os.utime(app, times=(10, 10))
+ bindist._do_create_tarball(tarball_2, binaries_dir=p, pkg_dir="pkg", buildinfo=buildinfo)
+
+ # They should be bitwise identical:
+ assert filecmp.cmp(tarball_1, tarball_2, shallow=False)
+
+ # Sanity check for contents:
+ with tarfile.open(tarball_1, mode="r") as f:
+ for m in f.getmembers():
+ assert m.uid == m.gid == m.mtime == 0
+ assert m.uname == m.gname == ""
+
+ assert set(f.getnames()) == {
+ "pkg",
+ "pkg/bin",
+ "pkg/bin/app",
+ "pkg/.spack",
+ "pkg/.spack/binary_distribution",
+ }
+
+
+def test_tarball_normalized_permissions(tmpdir):
+ p = tmpdir.mkdir("prefix")
+ p.mkdir("bin")
+ p.mkdir("share")
+ p.mkdir(".spack")
+
+ app = p.join("bin", "app")
+ data = p.join("share", "file")
+ tarball = str(tmpdir.join("prefix.tar.gz"))
+
+ # Everyone can write & execute. This should turn into 0o755 when the tarball is
+ # extracted (on a different system).
+ with open(app, "w", opener=lambda path, flags: os.open(path, flags, 0o777)) as f:
+ f.write("hello world")
+
+ # User doesn't have execute permissions, but group/world have; this should also
+ # turn into 0o644 (user read/write, group&world only read).
+ with open(data, "w", opener=lambda path, flags: os.open(path, flags, 0o477)) as f:
+ f.write("hello world")
+
+ bindist._do_create_tarball(tarball, binaries_dir=p, pkg_dir="pkg", buildinfo={})
+
+ with tarfile.open(tarball) as tar:
+ path_to_member = {member.name: member for member in tar.getmembers()}
+
+ # directories should have 0o755
+ assert path_to_member["pkg"].mode == 0o755
+ assert path_to_member["pkg/bin"].mode == 0o755
+ assert path_to_member["pkg/.spack"].mode == 0o755
+
+ # executable-by-user files should be 0o755
+ assert path_to_member["pkg/bin/app"].mode == 0o755
+
+ # not-executable-by-user files should be 0o644
+ assert path_to_member["pkg/share/file"].mode == 0o644