From c85877566f4793ae97848d9c23828d9fe8dd7edd Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Fri, 28 Apr 2023 15:24:24 +0200 Subject: Reduce the number of stat calls in "spack verify" (#37251) Spack comes to a crawl post-install of nvhpc, which is partly thanks to this post install hook which has a lot of redundancy, and isn't correct. 1. There's no need to store "type" because that _is_ "mode". 2. There are more file types than "symlink", "dir", "file". 3. Don't checksum device type things 4. Don't run 3 stat calls (exists, stat, isdir/islink), but one lstat call 5. Don't read entire files into memory I also don't know why `spack.crypto` wasn't used for checksumming, but I guess it's too late for that now. Finally md5 would've been the faster algorithm, which would've been fine given that a non cryptographicall checksum was used anyways. --- lib/spack/spack/test/verification.py | 29 +++++-------- lib/spack/spack/verify.py | 84 ++++++++++++++++-------------------- 2 files changed, 47 insertions(+), 66 deletions(-) diff --git a/lib/spack/spack/test/verification.py b/lib/spack/spack/test/verification.py index 86527f51d5..d23a430ab8 100644 --- a/lib/spack/spack/test/verification.py +++ b/lib/spack/spack/test/verification.py @@ -6,6 +6,7 @@ """Tests for the `spack.verify` module""" import os import shutil +import stat import sys import pytest @@ -30,22 +31,12 @@ def test_link_manifest_entry(tmpdir): os.symlink(file, link) data = spack.verify.create_manifest_entry(link) - assert data["type"] == "link" assert data["dest"] == file assert all(x in data for x in ("mode", "owner", "group")) results = spack.verify.check_entry(link, data) assert not results.has_errors() - data["type"] = "garbage" - - results = spack.verify.check_entry(link, data) - assert results.has_errors() - assert link in results.errors - assert results.errors[link] == ["type"] - - data["type"] = "link" - file2 = str(tmpdir.join("file2")) open(file2, "a").close() os.remove(link) @@ -64,18 +55,18 @@ def test_dir_manifest_entry(tmpdir): fs.mkdirp(dirent) data = spack.verify.create_manifest_entry(dirent) - assert data["type"] == "dir" + assert stat.S_ISDIR(data["mode"]) assert all(x in data for x in ("mode", "owner", "group")) results = spack.verify.check_entry(dirent, data) assert not results.has_errors() - data["type"] = "garbage" + data["mode"] = "garbage" results = spack.verify.check_entry(dirent, data) assert results.has_errors() assert dirent in results.errors - assert results.errors[dirent] == ["type"] + assert results.errors[dirent] == ["mode"] def test_file_manifest_entry(tmpdir): @@ -89,25 +80,25 @@ def test_file_manifest_entry(tmpdir): f.write(orig_str) data = spack.verify.create_manifest_entry(file) - assert data["type"] == "file" + assert stat.S_ISREG(data["mode"]) assert data["size"] == len(orig_str) - assert all(x in data for x in ("mode", "owner", "group")) + assert all(x in data for x in ("owner", "group")) results = spack.verify.check_entry(file, data) assert not results.has_errors() - data["type"] = "garbage" + data["mode"] = 0x99999 results = spack.verify.check_entry(file, data) assert results.has_errors() assert file in results.errors - assert results.errors[file] == ["type"] - - data["type"] = "file" + assert results.errors[file] == ["mode"] with open(file, "w") as f: f.write(new_str) + data["mode"] = os.stat(file).st_mode + results = spack.verify.check_entry(file, data) expected = ["size", "hash"] diff --git a/lib/spack/spack/verify.py b/lib/spack/spack/verify.py index a0d4ec3b3f..ead7df597d 100644 --- a/lib/spack/spack/verify.py +++ b/lib/spack/spack/verify.py @@ -5,6 +5,8 @@ import base64 import hashlib import os +import stat +from typing import Any, Dict import llnl.util.tty as tty @@ -14,35 +16,33 @@ import spack.util.file_permissions as fp import spack.util.spack_json as sjson -def compute_hash(path): - with open(path, "rb") as f: - sha1 = hashlib.sha1(f.read()).digest() - b32 = base64.b32encode(sha1) - return b32.decode() +def compute_hash(path: str, block_size: int = 1048576) -> str: + # why is this not using spack.util.crypto.checksum... + hasher = hashlib.sha1() + with open(path, "rb") as file: + while True: + data = file.read(block_size) + if not data: + break + hasher.update(data) + return base64.b32encode(hasher.digest()).decode() -def create_manifest_entry(path): - data = {} - - if os.path.exists(path): - stat = os.stat(path) - - data["mode"] = stat.st_mode - data["owner"] = stat.st_uid - data["group"] = stat.st_gid +def create_manifest_entry(path: str) -> Dict[str, Any]: + try: + s = os.lstat(path) + except OSError: + return {} - if os.path.islink(path): - data["type"] = "link" - data["dest"] = os.readlink(path) + data: Dict[str, Any] = {"mode": s.st_mode, "owner": s.st_uid, "group": s.st_gid} - elif os.path.isdir(path): - data["type"] = "dir" + if stat.S_ISLNK(s.st_mode): + data["dest"] = os.readlink(path) - else: - data["type"] = "file" - data["hash"] = compute_hash(path) - data["time"] = stat.st_mtime - data["size"] = stat.st_size + elif stat.S_ISREG(s.st_mode): + data["hash"] = compute_hash(path) + data["time"] = s.st_mtime + data["size"] = s.st_size return data @@ -75,38 +75,28 @@ def check_entry(path, data): res.add_error(path, "added") return res - stat = os.stat(path) + s = os.lstat(path) # Check for all entries - if stat.st_mode != data["mode"]: - res.add_error(path, "mode") - if stat.st_uid != data["owner"]: + if s.st_uid != data["owner"]: res.add_error(path, "owner") - if stat.st_gid != data["group"]: + if s.st_gid != data["group"]: res.add_error(path, "group") - # Check for symlink targets and listed as symlink - if os.path.islink(path): - if data["type"] != "link": - res.add_error(path, "type") - if os.readlink(path) != data.get("dest", ""): - res.add_error(path, "link") - - # Check directories are listed as directory - elif os.path.isdir(path): - if data["type"] != "dir": - res.add_error(path, "type") - - else: + # In the past, `stat(...).st_mode` was stored + # instead of `lstat(...).st_mode`. So, ignore mode errors for symlinks. + if not stat.S_ISLNK(s.st_mode) and s.st_mode != data["mode"]: + res.add_error(path, "mode") + elif stat.S_ISLNK(s.st_mode) and os.readlink(path) != data.get("dest"): + res.add_error(path, "link") + elif stat.S_ISREG(s.st_mode): # Check file contents against hash and listed as file # Check mtime and size as well - if stat.st_size != data["size"]: + if s.st_size != data["size"]: res.add_error(path, "size") - if stat.st_mtime != data["time"]: + if s.st_mtime != data["time"]: res.add_error(path, "mtime") - if data["type"] != "file": - res.add_error(path, "type") - if compute_hash(path) != data.get("hash", ""): + if compute_hash(path) != data.get("hash"): res.add_error(path, "hash") return res -- cgit v1.2.3-60-g2f50