diff options
-rw-r--r-- | lib/spack/spack/binary_distribution.py | 202 | ||||
-rw-r--r-- | lib/spack/spack/fetch_strategy.py | 38 | ||||
-rw-r--r-- | lib/spack/spack/test/oci/integration_test.py | 2 | ||||
-rw-r--r-- | lib/spack/spack/test/util/archive.py | 157 | ||||
-rw-r--r-- | lib/spack/spack/util/archive.py | 228 |
5 files changed, 423 insertions, 204 deletions
diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py index 751ec1ef7f..6f401e4a97 100644 --- a/lib/spack/spack/binary_distribution.py +++ b/lib/spack/spack/binary_distribution.py @@ -5,7 +5,6 @@ import codecs import collections -import errno import hashlib import io import itertools @@ -23,8 +22,7 @@ import urllib.error import urllib.parse import urllib.request import warnings -from contextlib import closing, contextmanager -from gzip import GzipFile +from contextlib import closing from typing import Dict, Iterable, List, NamedTuple, Optional, Set, Tuple from urllib.error import HTTPError, URLError @@ -50,6 +48,7 @@ import spack.repo import spack.stage import spack.store import spack.traverse as traverse +import spack.util.archive import spack.util.crypto import spack.util.file_cache as file_cache import spack.util.gpg @@ -1133,205 +1132,46 @@ def generate_key_index(key_prefix, tmpdir=None): shutil.rmtree(tmpdir) -@contextmanager -def gzip_compressed_tarfile(path): - """Create a reproducible, compressed tarfile""" - # Create gzip compressed tarball of the install prefix - # 1) Use explicit empty filename and mtime 0 for gzip header reproducibility. - # If the filename="" is dropped, Python will use fileobj.name instead. - # This should effectively mimick `gzip --no-name`. - # 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed: - # compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB - # compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB - # So we follow gzip. 
- with open(path, "wb") as f, ChecksumWriter(f) as inner_checksum, closing( - GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=inner_checksum) - ) as gzip_file, ChecksumWriter(gzip_file) as outer_checksum, tarfile.TarFile( - name="", mode="w", fileobj=outer_checksum - ) as tar: - yield tar, inner_checksum, outer_checksum - - -def _tarinfo_name(absolute_path: str, *, _path=pathlib.PurePath) -> str: - """Compute tarfile entry name as the relative path from the (system) root.""" - return _path(*_path(absolute_path).parts[1:]).as_posix() - - def tarfile_of_spec_prefix(tar: tarfile.TarFile, prefix: str) -> None: """Create a tarfile of an install prefix of a spec. Skips existing buildinfo file. - Only adds regular files, symlinks and dirs. Skips devices, fifos. Preserves hardlinks. - Normalizes permissions like git. Tar entries are added in depth-first pre-order, with - dir entries partitioned by file | dir, and sorted alphabetically, for reproducibility. - Partitioning ensures only one dir is in memory at a time, and sorting improves compression. Args: tar: tarfile object to add files to prefix: absolute install prefix of spec""" if not os.path.isabs(prefix) or not os.path.isdir(prefix): raise ValueError(f"prefix '{prefix}' must be an absolute path to a directory") - hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict() stat_key = lambda stat: (stat.st_dev, stat.st_ino) try: # skip buildinfo file if it exists files_to_skip = [stat_key(os.lstat(buildinfo_file_name(prefix)))] + skip = lambda entry: stat_key(entry.stat(follow_symlinks=False)) in files_to_skip except OSError: - files_to_skip = [] - - # First add all directories leading up to `prefix` (Spack <= 0.21 did not do this, leading to - # issues when tarballs are used in runtimes like AWS lambda). Skip the file system root. 
- parent_dirs = reversed(pathlib.Path(prefix).parents) - next(parent_dirs) # skip the root: slices are supported from python 3.10 - for parent_dir in parent_dirs: - dir_info = tarfile.TarInfo(_tarinfo_name(str(parent_dir))) - dir_info.type = tarfile.DIRTYPE - dir_info.mode = 0o755 - tar.addfile(dir_info) - - dir_stack = [prefix] - while dir_stack: - dir = dir_stack.pop() - - # Add the dir before its contents - dir_info = tarfile.TarInfo(_tarinfo_name(dir)) - dir_info.type = tarfile.DIRTYPE - dir_info.mode = 0o755 - tar.addfile(dir_info) - - # Sort by name: reproducible & improves compression - with os.scandir(dir) as it: - entries = sorted(it, key=lambda entry: entry.name) - - new_dirs = [] - for entry in entries: - if entry.is_dir(follow_symlinks=False): - new_dirs.append(entry.path) - continue - - file_info = tarfile.TarInfo(_tarinfo_name(entry.path)) - - s = entry.stat(follow_symlinks=False) - - # Skip existing binary distribution files. - id = stat_key(s) - if id in files_to_skip: - continue - - # Normalize the mode - file_info.mode = 0o644 if s.st_mode & 0o100 == 0 else 0o755 - - if entry.is_symlink(): - file_info.type = tarfile.SYMTYPE - file_info.linkname = os.readlink(entry.path) - tar.addfile(file_info) - - elif entry.is_file(follow_symlinks=False): - # Deduplicate hardlinks - if s.st_nlink > 1: - if id in hardlink_to_tarinfo_name: - file_info.type = tarfile.LNKTYPE - file_info.linkname = hardlink_to_tarinfo_name[id] - tar.addfile(file_info) - continue - hardlink_to_tarinfo_name[id] = file_info.name - - # If file not yet seen, copy it. 
- file_info.type = tarfile.REGTYPE - file_info.size = s.st_size - - with open(entry.path, "rb") as f: - tar.addfile(file_info, f) - - dir_stack.extend(reversed(new_dirs)) # we pop, so reverse to stay alphabetical - - -class ChecksumWriter(io.BufferedIOBase): - """Checksum writer computes a checksum while writing to a file.""" - - myfileobj = None - - def __init__(self, fileobj, algorithm=hashlib.sha256): - self.fileobj = fileobj - self.hasher = algorithm() - self.length = 0 - - def hexdigest(self): - return self.hasher.hexdigest() - - def write(self, data): - if isinstance(data, (bytes, bytearray)): - length = len(data) - else: - data = memoryview(data) - length = data.nbytes - - if length > 0: - self.fileobj.write(data) - self.hasher.update(data) - - self.length += length - - return length - - def read(self, size=-1): - raise OSError(errno.EBADF, "read() on write-only object") - - def read1(self, size=-1): - raise OSError(errno.EBADF, "read1() on write-only object") - - def peek(self, n): - raise OSError(errno.EBADF, "peek() on write-only object") - - @property - def closed(self): - return self.fileobj is None - - def close(self): - fileobj = self.fileobj - if fileobj is None: - return - self.fileobj.close() - self.fileobj = None - - def flush(self): - self.fileobj.flush() - - def fileno(self): - return self.fileobj.fileno() - - def rewind(self): - raise OSError("Can't rewind while computing checksum") - - def readable(self): - return False - - def writable(self): - return True - - def seekable(self): - return True - - def tell(self): - return self.fileobj.tell() - - def seek(self, offset, whence=io.SEEK_SET): - # In principle forward seek is possible with b"0" padding, - # but this is not implemented. 
- if offset == 0 and whence == io.SEEK_CUR: - return - raise OSError("Can't seek while computing checksum") - - def readline(self, size=-1): - raise OSError(errno.EBADF, "readline() on write-only object") + skip = lambda entry: False + + spack.util.archive.reproducible_tarfile_from_prefix( + tar, + prefix, + # Spack <= 0.21 did not include parent directories, leading to issues when tarballs are + # used in runtimes like AWS lambda. + include_parent_directories=True, + skip=skip, + ) def _do_create_tarball(tarfile_path: str, binaries_dir: str, buildinfo: dict): - with gzip_compressed_tarfile(tarfile_path) as (tar, inner_checksum, outer_checksum): + with spack.util.archive.gzip_compressed_tarfile(tarfile_path) as ( + tar, + inner_checksum, + outer_checksum, + ): # Tarball the install prefix tarfile_of_spec_prefix(tar, binaries_dir) # Serialize buildinfo for the tarball bstring = syaml.dump(buildinfo, default_flow_style=True).encode("utf-8") - tarinfo = tarfile.TarInfo(name=_tarinfo_name(buildinfo_file_name(binaries_dir))) + tarinfo = tarfile.TarInfo( + name=spack.util.archive.default_path_to_name(buildinfo_file_name(binaries_dir)) + ) tarinfo.type = tarfile.REGTYPE tarinfo.size = len(bstring) tarinfo.mode = 0o644 diff --git a/lib/spack/spack/fetch_strategy.py b/lib/spack/spack/fetch_strategy.py index 864fcddcc3..93df10c98f 100644 --- a/lib/spack/spack/fetch_strategy.py +++ b/lib/spack/spack/fetch_strategy.py @@ -30,6 +30,7 @@ import re import shutil import urllib.error import urllib.parse +from pathlib import PurePath from typing import List, Optional import llnl.url @@ -37,13 +38,14 @@ import llnl.util import llnl.util.filesystem as fs import llnl.util.tty as tty from llnl.string import comma_and, quote -from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, temp_rename, working_dir +from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, working_dir from llnl.util.symlink import symlink import spack.config import spack.error import 
spack.oci.opener import spack.url +import spack.util.archive import spack.util.crypto as crypto import spack.util.git import spack.util.url as url_util @@ -600,29 +602,21 @@ class VCSFetchStrategy(FetchStrategy): tty.debug("Source fetched with %s is already expanded." % self.url_attr) @_needs_stage - def archive(self, destination, **kwargs): + def archive(self, destination, *, exclude: Optional[str] = None): assert llnl.url.extension_from_path(destination) == "tar.gz" assert self.stage.source_path.startswith(self.stage.path) - - tar = which("tar", required=True) - - patterns = kwargs.get("exclude", None) - if patterns is not None: - if isinstance(patterns, str): - patterns = [patterns] - for p in patterns: - tar.add_default_arg("--exclude=%s" % p) - - with working_dir(self.stage.path): - if self.stage.srcdir: - # Here we create an archive with the default repository name. - # The 'tar' command has options for changing the name of a - # directory that is included in the archive, but they differ - # based on OS, so we temporarily rename the repo - with temp_rename(self.stage.source_path, self.stage.srcdir): - tar("-czf", destination, self.stage.srcdir) - else: - tar("-czf", destination, os.path.basename(self.stage.source_path)) + # We need to prepend this dir name to every entry of the tarfile + top_level_dir = PurePath(self.stage.srcdir or os.path.basename(self.stage.source_path)) + + with working_dir(self.stage.source_path), spack.util.archive.gzip_compressed_tarfile( + destination + ) as (tar, _, _): + spack.util.archive.reproducible_tarfile_from_prefix( + tar=tar, + prefix=".", + skip=lambda entry: entry.name == exclude, + path_to_name=lambda path: (top_level_dir / PurePath(path)).as_posix(), + ) def __str__(self): return "VCS: %s" % self.url diff --git a/lib/spack/spack/test/oci/integration_test.py b/lib/spack/spack/test/oci/integration_test.py index a2b1ac2f4f..8129dd22cf 100644 --- a/lib/spack/spack/test/oci/integration_test.py +++ 
b/lib/spack/spack/test/oci/integration_test.py @@ -13,11 +13,11 @@ from contextlib import contextmanager import spack.environment as ev import spack.oci.opener -from spack.binary_distribution import gzip_compressed_tarfile from spack.main import SpackCommand from spack.oci.image import Digest, ImageReference, default_config, default_manifest from spack.oci.oci import blob_exists, get_manifest_and_config, upload_blob, upload_manifest from spack.test.oci.mock_registry import DummyServer, InMemoryOCIRegistry, create_opener +from spack.util.archive import gzip_compressed_tarfile buildcache = SpackCommand("buildcache") mirror = SpackCommand("mirror") diff --git a/lib/spack/spack/test/util/archive.py b/lib/spack/spack/test/util/archive.py new file mode 100644 index 0000000000..9688f25924 --- /dev/null +++ b/lib/spack/spack/test/util/archive.py @@ -0,0 +1,157 @@ +# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +import gzip +import hashlib +import os +import shutil +import tarfile +from pathlib import Path, PurePath + +import spack.util.crypto +from spack.util.archive import gzip_compressed_tarfile, reproducible_tarfile_from_prefix + + +def test_gzip_compressed_tarball_is_reproducible(tmpdir): + """Test gzip_compressed_tarfile and reproducible_tarfile_from_prefix for reproducibility""" + + with tmpdir.as_cwd(): + # Create a few directories + root = Path("root") + dir_a = root / "a" + dir_b = root / "b" + root.mkdir(mode=0o777) + dir_a.mkdir(mode=0o777) + dir_b.mkdir(mode=0o777) + + (root / "y").touch() + (root / "x").touch() + + (dir_a / "executable").touch(mode=0o777) + (dir_a / "data").touch(mode=0o666) + (dir_a / "symlink_file").symlink_to("data") + (dir_a / "symlink_dir").symlink_to(PurePath("..", "b")) + try: + os.link(dir_a / "executable", dir_a / "hardlink") + hardlink_support = True + except OSError: + hardlink_support = False + + (dir_b / "executable").touch(mode=0o777) + (dir_b / "data").touch(mode=0o666) + (dir_b / "symlink_file").symlink_to("data") + (dir_b / "symlink_dir").symlink_to(PurePath("..", "a")) + + # Create the first tarball + with gzip_compressed_tarfile("fst.tar.gz") as (tar, gzip_checksum_1, tarfile_checksum_1): + reproducible_tarfile_from_prefix(tar, "root") + + # Expected mode for non-dirs is 644 if not executable, 755 if executable. Better to compute + # that as we don't know the umask of the user running the test. 
+ expected_mode = ( + lambda name: 0o755 if Path(*name.split("/")).lstat().st_mode & 0o100 else 0o644 + ) + + # Verify the tarball contents + with tarfile.open("fst.tar.gz", "r:gz") as tar: + # Directories (mode is always 755) + for dir in ("root", "root/a", "root/b"): + m = tar.getmember(dir) + assert m.isdir() + assert m.mode == 0o755 + assert m.uid == m.gid == 0 + assert m.uname == m.gname == "" + + # Non-executable regular files + for file in ( + "root/x", + "root/y", + "root/a/data", + "root/b/data", + "root/a/executable", + "root/b/executable", + ): + m = tar.getmember(file) + assert m.isreg() + assert m.mode == expected_mode(file) + assert m.uid == m.gid == 0 + assert m.uname == m.gname == "" + + # Symlinks + for file in ( + "root/a/symlink_file", + "root/a/symlink_dir", + "root/b/symlink_file", + "root/b/symlink_dir", + ): + m = tar.getmember(file) + assert m.issym() + assert m.mode == 0o755 + assert m.uid == m.gid == m.mtime == 0 + assert m.uname == m.gname == "" + + # Verify the symlink targets. Notice that symlink targets are copied verbatim. That + # means the value is platform specific for relative symlinks within the current prefix, + # as on Windows they'd be ..\a and ..\b instead of ../a and ../b. So, reproducibility + # is only guaranteed per-platform currently. + assert PurePath(tar.getmember("root/a/symlink_file").linkname) == PurePath("data") + assert PurePath(tar.getmember("root/b/symlink_file").linkname) == PurePath("data") + assert PurePath(tar.getmember("root/a/symlink_dir").linkname) == PurePath("..", "b") + assert PurePath(tar.getmember("root/b/symlink_dir").linkname) == PurePath("..", "a") + + # Check hardlink if supported + if hardlink_support: + m = tar.getmember("root/a/hardlink") + assert m.islnk() + assert m.mode == expected_mode("root/a/hardlink") + assert m.uid == m.gid == 0 + assert m.uname == m.gname == "" + # Hardlink targets are always in posix format, as they reference a file that exists + # in the tarball. 
+ assert m.linkname == "root/a/executable" + + # Finally verify if entries are ordered by (is_dir, name) + assert [t.name for t in tar.getmembers()] == [ + "root", + "root/x", + "root/y", + "root/a", + "root/a/data", + "root/a/executable", + *(["root/a/hardlink"] if hardlink_support else []), + "root/a/symlink_dir", + "root/a/symlink_file", + "root/b", + "root/b/data", + "root/b/executable", + "root/b/symlink_dir", + "root/b/symlink_file", + ] + + # Delete the current root dir, extract the first tarball, create a second + shutil.rmtree(root) + with tarfile.open("fst.tar.gz", "r:gz") as tar: + tar.extractall() + + # Create the second tarball + with gzip_compressed_tarfile("snd.tar.gz") as (tar, gzip_checksum_2, tarfile_checksum_2): + reproducible_tarfile_from_prefix(tar, "root") + + # Verify the .tar.gz checksums are identical and correct + assert ( + gzip_checksum_1.hexdigest() + == gzip_checksum_2.hexdigest() + == spack.util.crypto.checksum(hashlib.sha256, "fst.tar.gz") + == spack.util.crypto.checksum(hashlib.sha256, "snd.tar.gz") + ) + + # Verify the .tar checksums are identical and correct + with gzip.open("fst.tar.gz", "rb") as f, gzip.open("snd.tar.gz", "rb") as g: + assert ( + tarfile_checksum_1.hexdigest() + == tarfile_checksum_2.hexdigest() + == spack.util.crypto.checksum_stream(hashlib.sha256, f) + == spack.util.crypto.checksum_stream(hashlib.sha256, g) + ) diff --git a/lib/spack/spack/util/archive.py b/lib/spack/spack/util/archive.py new file mode 100644 index 0000000000..8bde40017c --- /dev/null +++ b/lib/spack/spack/util/archive.py @@ -0,0 +1,228 @@ +# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) +import errno +import hashlib +import io +import os +import pathlib +import tarfile +from contextlib import closing, contextmanager +from gzip import GzipFile +from typing import Callable, Dict, Tuple + + +class ChecksumWriter(io.BufferedIOBase): + """Checksum writer computes a checksum while writing to a file.""" + + myfileobj = None + + def __init__(self, fileobj, algorithm=hashlib.sha256): + self.fileobj = fileobj + self.hasher = algorithm() + self.length = 0 + + def hexdigest(self): + return self.hasher.hexdigest() + + def write(self, data): + if isinstance(data, (bytes, bytearray)): + length = len(data) + else: + data = memoryview(data) + length = data.nbytes + + if length > 0: + self.fileobj.write(data) + self.hasher.update(data) + + self.length += length + + return length + + def read(self, size=-1): + raise OSError(errno.EBADF, "read() on write-only object") + + def read1(self, size=-1): + raise OSError(errno.EBADF, "read1() on write-only object") + + def peek(self, n): + raise OSError(errno.EBADF, "peek() on write-only object") + + @property + def closed(self): + return self.fileobj is None + + def close(self): + fileobj = self.fileobj + if fileobj is None: + return + self.fileobj.close() + self.fileobj = None + + def flush(self): + self.fileobj.flush() + + def fileno(self): + return self.fileobj.fileno() + + def rewind(self): + raise OSError("Can't rewind while computing checksum") + + def readable(self): + return False + + def writable(self): + return True + + def seekable(self): + return True + + def tell(self): + return self.fileobj.tell() + + def seek(self, offset, whence=io.SEEK_SET): + # In principle forward seek is possible with b"0" padding, + # but this is not implemented. 
+ if offset == 0 and whence == io.SEEK_CUR: + return + raise OSError("Can't seek while computing checksum") + + def readline(self, size=-1): + raise OSError(errno.EBADF, "readline() on write-only object") + + +@contextmanager +def gzip_compressed_tarfile(path): + """Create a reproducible, gzip compressed tarfile, and keep track of shasums of both the + compressed and uncompressed tarfile. Reproducibility is achieved by normalizing the gzip header + (no file name and zero mtime). + + Yields a tuple of the following: + tarfile.TarFile: tarfile object + ChecksumWriter: checksum of the gzip compressed tarfile + ChecksumWriter: checksum of the uncompressed tarfile + """ + # Create gzip compressed tarball of the install prefix + # 1) Use explicit empty filename and mtime 0 for gzip header reproducibility. + # If the filename="" is dropped, Python will use fileobj.name instead. + # This should effectively mimic `gzip --no-name`. + # 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed: + # compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB + # compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB + # So we follow gzip. + with open(path, "wb") as f, ChecksumWriter(f) as gzip_checksum, closing( + GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=gzip_checksum) + ) as gzip_file, ChecksumWriter(gzip_file) as tarfile_checksum, tarfile.TarFile( + name="", mode="w", fileobj=tarfile_checksum + ) as tar: + yield tar, gzip_checksum, tarfile_checksum + + +def default_path_to_name(path: str) -> str: + """Converts a path to a tarfile name, which uses posix path separators.""" + p = pathlib.PurePath(path) + # Drop the leading slash on posix and the drive letter on windows, and always format as a + # posix path. 
+ return pathlib.PurePath(*p.parts[1:]).as_posix() if p.is_absolute() else p.as_posix() + + +def reproducible_tarfile_from_prefix( + tar: tarfile.TarFile, + prefix: str, + *, + include_parent_directories: bool = False, + skip: Callable[[os.DirEntry], bool] = lambda entry: False, + path_to_name: Callable[[str], str] = default_path_to_name, +) -> None: + """Create a tarball from a given directory. Only adds regular files, symlinks and dirs. + Skips devices, fifos. Preserves hardlinks. Normalizes permissions like git. Tar entries are + added in depth-first pre-order, with dir entries partitioned by file | dir, and sorted + lexicographically, for reproducibility. Partitioning ensures only one dir is in memory at a + time, and sorting improves compression. + + Args: + tar: tarfile object opened in write mode + prefix: path to directory to tar (either absolute or relative) + include_parent_directories: whether to include every directory leading up to ``prefix`` in + the tarball + skip: function that receives a DirEntry and returns True if the entry should be skipped, + whether it is a file or directory. Default implementation does not skip anything. + path_to_name: function that converts a path string to a tarfile entry name, which should be + in posix format. Not only is it necessary to transform paths in certain cases, such as + windows path to posix format, but it can also be used to prepend a directory to each + entry even if it does not exist on the filesystem. 
The default implementation drops the + leading slash on posix and the drive letter on windows for absolute paths, and formats + as a posix.""" + + hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict() + + if include_parent_directories: + parent_dirs = reversed(pathlib.Path(prefix).parents) + next(parent_dirs) # skip the root: slices are supported from python 3.10 + for parent_dir in parent_dirs: + dir_info = tarfile.TarInfo(path_to_name(str(parent_dir))) + dir_info.type = tarfile.DIRTYPE + dir_info.mode = 0o755 + tar.addfile(dir_info) + + dir_stack = [prefix] + while dir_stack: + dir = dir_stack.pop() + + # Add the dir before its contents + dir_info = tarfile.TarInfo(path_to_name(dir)) + dir_info.type = tarfile.DIRTYPE + dir_info.mode = 0o755 + tar.addfile(dir_info) + + # Sort by name: reproducible & improves compression + with os.scandir(dir) as it: + entries = sorted(it, key=lambda entry: entry.name) + + new_dirs = [] + for entry in entries: + if skip(entry): + continue + + if entry.is_dir(follow_symlinks=False): + new_dirs.append(entry.path) + continue + + file_info = tarfile.TarInfo(path_to_name(entry.path)) + + if entry.is_symlink(): + file_info.type = tarfile.SYMTYPE + file_info.linkname = os.readlink(entry.path) + # According to POSIX: "the value of the file mode bits returned in the + # st_mode field of the stat structure is unspecified." So we set it to + # something sensible without lstat'ing the link. + file_info.mode = 0o755 + tar.addfile(file_info) + + elif entry.is_file(follow_symlinks=False): + # entry.stat has zero (st_ino, st_dev, st_nlink) on Windows: use lstat. 
+ s = os.lstat(entry.path) + + # Normalize permissions like git + file_info.mode = 0o755 if s.st_mode & 0o100 else 0o644 + + # Deduplicate hardlinks + if s.st_nlink > 1: + ident = (s.st_dev, s.st_ino) + if ident in hardlink_to_tarinfo_name: + file_info.type = tarfile.LNKTYPE + file_info.linkname = hardlink_to_tarinfo_name[ident] + tar.addfile(file_info) + continue + hardlink_to_tarinfo_name[ident] = file_info.name + + # If file not yet seen, copy it + file_info.type = tarfile.REGTYPE + file_info.size = s.st_size + + with open(entry.path, "rb") as f: + tar.addfile(file_info, f) + + dir_stack.extend(reversed(new_dirs)) # we pop, so reverse to stay alphabetical |