summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHarmen Stoppels <harmenstoppels@gmail.com>2023-04-28 15:24:24 +0200
committerGitHub <noreply@github.com>2023-04-28 13:24:24 +0000
commitc85877566f4793ae97848d9c23828d9fe8dd7edd (patch)
tree437e5abef365ffc364dd1457b50797f4d12e5076
parentfc201a2b7578e49078f066c7544c7b637309af38 (diff)
downloadspack-c85877566f4793ae97848d9c23828d9fe8dd7edd.tar.gz
spack-c85877566f4793ae97848d9c23828d9fe8dd7edd.tar.bz2
spack-c85877566f4793ae97848d9c23828d9fe8dd7edd.tar.xz
spack-c85877566f4793ae97848d9c23828d9fe8dd7edd.zip
Reduce the number of stat calls in "spack verify" (#37251)
Spack comes to a crawl post-install of nvhpc, which is partly thanks to this post install hook which has a lot of redundancy, and isn't correct. 1. There's no need to store "type" because that _is_ "mode". 2. There are more file types than "symlink", "dir", "file". 3. Don't checksum device type things 4. Don't run 3 stat calls (exists, stat, isdir/islink), but one lstat call 5. Don't read entire files into memory I also don't know why `spack.crypto` wasn't used for checksumming, but I guess it's too late for that now. Finally md5 would've been the faster algorithm, which would've been fine given that a non cryptographicall checksum was used anyways.
-rw-r--r--lib/spack/spack/test/verification.py29
-rw-r--r--lib/spack/spack/verify.py84
2 files changed, 47 insertions, 66 deletions
diff --git a/lib/spack/spack/test/verification.py b/lib/spack/spack/test/verification.py
index 86527f51d5..d23a430ab8 100644
--- a/lib/spack/spack/test/verification.py
+++ b/lib/spack/spack/test/verification.py
@@ -6,6 +6,7 @@
"""Tests for the `spack.verify` module"""
import os
import shutil
+import stat
import sys
import pytest
@@ -30,22 +31,12 @@ def test_link_manifest_entry(tmpdir):
os.symlink(file, link)
data = spack.verify.create_manifest_entry(link)
- assert data["type"] == "link"
assert data["dest"] == file
assert all(x in data for x in ("mode", "owner", "group"))
results = spack.verify.check_entry(link, data)
assert not results.has_errors()
- data["type"] = "garbage"
-
- results = spack.verify.check_entry(link, data)
- assert results.has_errors()
- assert link in results.errors
- assert results.errors[link] == ["type"]
-
- data["type"] = "link"
-
file2 = str(tmpdir.join("file2"))
open(file2, "a").close()
os.remove(link)
@@ -64,18 +55,18 @@ def test_dir_manifest_entry(tmpdir):
fs.mkdirp(dirent)
data = spack.verify.create_manifest_entry(dirent)
- assert data["type"] == "dir"
+ assert stat.S_ISDIR(data["mode"])
assert all(x in data for x in ("mode", "owner", "group"))
results = spack.verify.check_entry(dirent, data)
assert not results.has_errors()
- data["type"] = "garbage"
+ data["mode"] = "garbage"
results = spack.verify.check_entry(dirent, data)
assert results.has_errors()
assert dirent in results.errors
- assert results.errors[dirent] == ["type"]
+ assert results.errors[dirent] == ["mode"]
def test_file_manifest_entry(tmpdir):
@@ -89,25 +80,25 @@ def test_file_manifest_entry(tmpdir):
f.write(orig_str)
data = spack.verify.create_manifest_entry(file)
- assert data["type"] == "file"
+ assert stat.S_ISREG(data["mode"])
assert data["size"] == len(orig_str)
- assert all(x in data for x in ("mode", "owner", "group"))
+ assert all(x in data for x in ("owner", "group"))
results = spack.verify.check_entry(file, data)
assert not results.has_errors()
- data["type"] = "garbage"
+ data["mode"] = 0x99999
results = spack.verify.check_entry(file, data)
assert results.has_errors()
assert file in results.errors
- assert results.errors[file] == ["type"]
-
- data["type"] = "file"
+ assert results.errors[file] == ["mode"]
with open(file, "w") as f:
f.write(new_str)
+ data["mode"] = os.stat(file).st_mode
+
results = spack.verify.check_entry(file, data)
expected = ["size", "hash"]
diff --git a/lib/spack/spack/verify.py b/lib/spack/spack/verify.py
index a0d4ec3b3f..ead7df597d 100644
--- a/lib/spack/spack/verify.py
+++ b/lib/spack/spack/verify.py
@@ -5,6 +5,8 @@
import base64
import hashlib
import os
+import stat
+from typing import Any, Dict
import llnl.util.tty as tty
@@ -14,35 +16,33 @@ import spack.util.file_permissions as fp
import spack.util.spack_json as sjson
-def compute_hash(path):
- with open(path, "rb") as f:
- sha1 = hashlib.sha1(f.read()).digest()
- b32 = base64.b32encode(sha1)
- return b32.decode()
+def compute_hash(path: str, block_size: int = 1048576) -> str:
+ # why is this not using spack.util.crypto.checksum...
+ hasher = hashlib.sha1()
+ with open(path, "rb") as file:
+ while True:
+ data = file.read(block_size)
+ if not data:
+ break
+ hasher.update(data)
+ return base64.b32encode(hasher.digest()).decode()
-def create_manifest_entry(path):
- data = {}
-
- if os.path.exists(path):
- stat = os.stat(path)
-
- data["mode"] = stat.st_mode
- data["owner"] = stat.st_uid
- data["group"] = stat.st_gid
+def create_manifest_entry(path: str) -> Dict[str, Any]:
+ try:
+ s = os.lstat(path)
+ except OSError:
+ return {}
- if os.path.islink(path):
- data["type"] = "link"
- data["dest"] = os.readlink(path)
+ data: Dict[str, Any] = {"mode": s.st_mode, "owner": s.st_uid, "group": s.st_gid}
- elif os.path.isdir(path):
- data["type"] = "dir"
+ if stat.S_ISLNK(s.st_mode):
+ data["dest"] = os.readlink(path)
- else:
- data["type"] = "file"
- data["hash"] = compute_hash(path)
- data["time"] = stat.st_mtime
- data["size"] = stat.st_size
+ elif stat.S_ISREG(s.st_mode):
+ data["hash"] = compute_hash(path)
+ data["time"] = s.st_mtime
+ data["size"] = s.st_size
return data
@@ -75,38 +75,28 @@ def check_entry(path, data):
res.add_error(path, "added")
return res
- stat = os.stat(path)
+ s = os.lstat(path)
# Check for all entries
- if stat.st_mode != data["mode"]:
- res.add_error(path, "mode")
- if stat.st_uid != data["owner"]:
+ if s.st_uid != data["owner"]:
res.add_error(path, "owner")
- if stat.st_gid != data["group"]:
+ if s.st_gid != data["group"]:
res.add_error(path, "group")
- # Check for symlink targets and listed as symlink
- if os.path.islink(path):
- if data["type"] != "link":
- res.add_error(path, "type")
- if os.readlink(path) != data.get("dest", ""):
- res.add_error(path, "link")
-
- # Check directories are listed as directory
- elif os.path.isdir(path):
- if data["type"] != "dir":
- res.add_error(path, "type")
-
- else:
+ # In the past, `stat(...).st_mode` was stored
+ # instead of `lstat(...).st_mode`. So, ignore mode errors for symlinks.
+ if not stat.S_ISLNK(s.st_mode) and s.st_mode != data["mode"]:
+ res.add_error(path, "mode")
+ elif stat.S_ISLNK(s.st_mode) and os.readlink(path) != data.get("dest"):
+ res.add_error(path, "link")
+ elif stat.S_ISREG(s.st_mode):
# Check file contents against hash and listed as file
# Check mtime and size as well
- if stat.st_size != data["size"]:
+ if s.st_size != data["size"]:
res.add_error(path, "size")
- if stat.st_mtime != data["time"]:
+ if s.st_mtime != data["time"]:
res.add_error(path, "mtime")
- if data["type"] != "file":
- res.add_error(path, "type")
- if compute_hash(path) != data.get("hash", ""):
+ if compute_hash(path) != data.get("hash"):
res.add_error(path, "hash")
return res