From ba1d29502388ba85d9c45df4ac70c8b980a8720d Mon Sep 17 00:00:00 2001 From: Massimiliano Culpo Date: Mon, 7 Aug 2023 12:47:52 +0200 Subject: Extract prefix locks and failure markers from Database (#39024) This PR extracts two responsibilities from the `Database` class: 1. Managing locks for prefixes during an installation 2. Marking installation failures and pushes them into their own classes (`SpecLocker` and `FailureTracker`). These responsibilities are also pushed up into the `Store`, leaving to `Database` only the duty to manage `index.json` files. `SpecLocker` classes no longer share a global list of locks, but locks are per instance. Their identifier is simply `(dag hash, package name)`, and not the spec prefix path, to avoid circular dependencies across Store / Database / Spec. --- lib/spack/spack/cmd/clean.py | 3 +- lib/spack/spack/database.py | 500 ++++++++++++++++++------------------ lib/spack/spack/installer.py | 23 +- lib/spack/spack/package_base.py | 2 +- lib/spack/spack/stage.py | 2 +- lib/spack/spack/store.py | 37 ++- lib/spack/spack/test/cmd/clean.py | 4 +- lib/spack/spack/test/cmd/install.py | 9 +- lib/spack/spack/test/conftest.py | 13 +- lib/spack/spack/test/database.py | 63 ++--- lib/spack/spack/test/install.py | 4 +- lib/spack/spack/test/installer.py | 80 +++--- 12 files changed, 357 insertions(+), 383 deletions(-) (limited to 'lib') diff --git a/lib/spack/spack/cmd/clean.py b/lib/spack/spack/cmd/clean.py index 93edcd712f..89c1a90fc3 100644 --- a/lib/spack/spack/cmd/clean.py +++ b/lib/spack/spack/cmd/clean.py @@ -17,6 +17,7 @@ import spack.cmd.test import spack.config import spack.repo import spack.stage +import spack.store import spack.util.path from spack.paths import lib_path, var_path @@ -121,7 +122,7 @@ def clean(parser, args): if args.failures: tty.msg("Removing install failure marks") - spack.installer.clear_failures() + spack.store.STORE.failure_tracker.clear_all() if args.misc_cache: tty.msg("Removing cached information on repositories") 
diff --git a/lib/spack/spack/database.py b/lib/spack/spack/database.py index cc3a586218..ebca0f4850 100644 --- a/lib/spack/spack/database.py +++ b/lib/spack/spack/database.py @@ -21,10 +21,11 @@ filesystem. import contextlib import datetime import os +import pathlib import socket import sys import time -from typing import Dict, List, NamedTuple, Set, Type, Union +from typing import Any, Callable, Dict, Generator, List, NamedTuple, Set, Type, Union try: import uuid @@ -141,22 +142,23 @@ class InstallStatuses: def canonicalize(cls, query_arg): if query_arg is True: return [cls.INSTALLED] - elif query_arg is False: + if query_arg is False: return [cls.MISSING] - elif query_arg is any: + if query_arg is any: return [cls.INSTALLED, cls.DEPRECATED, cls.MISSING] - elif isinstance(query_arg, InstallStatus): + if isinstance(query_arg, InstallStatus): return [query_arg] - else: - try: # Try block catches if it is not an iterable at all - if any(type(x) != InstallStatus for x in query_arg): - raise TypeError - except TypeError: - raise TypeError( - "installation query must be `any`, boolean, " - "InstallStatus, or iterable of InstallStatus" - ) - return query_arg + try: + statuses = list(query_arg) + if all(isinstance(x, InstallStatus) for x in statuses): + return statuses + except TypeError: + pass + + raise TypeError( + "installation query must be `any`, boolean, " + "InstallStatus, or iterable of InstallStatus" + ) class InstallRecord: @@ -306,15 +308,16 @@ _QUERY_DOCSTRING = """ """ -#: Data class to configure locks in Database objects -#: -#: Args: -#: enable (bool): whether to enable locks or not. -#: database_timeout (int or None): timeout for the database lock -#: package_timeout (int or None): timeout for the package lock - class LockConfiguration(NamedTuple): + """Data class to configure locks in Database objects + + Args: + enable: whether to enable locks or not. 
+ database_timeout: timeout for the database lock + package_timeout: timeout for the package lock + """ + enable: bool database_timeout: Optional[int] package_timeout: Optional[int] @@ -348,13 +351,230 @@ def lock_configuration(configuration): ) -class Database: - #: Per-process lock objects for each install prefix - _prefix_locks: Dict[str, lk.Lock] = {} +def prefix_lock_path(root_dir: Union[str, pathlib.Path]) -> pathlib.Path: + """Returns the path of the prefix lock file, given the root directory. + + Args: + root_dir: root directory containing the database directory + """ + return pathlib.Path(root_dir) / _DB_DIRNAME / "prefix_lock" + + +def failures_lock_path(root_dir: Union[str, pathlib.Path]) -> pathlib.Path: + """Returns the path of the failures lock file, given the root directory. + + Args: + root_dir: root directory containing the database directory + """ + return pathlib.Path(root_dir) / _DB_DIRNAME / "prefix_failures" + + +class SpecLocker: + """Manages acquiring and releasing read or write locks on concrete specs.""" + + def __init__(self, lock_path: Union[str, pathlib.Path], default_timeout: Optional[float]): + self.lock_path = pathlib.Path(lock_path) + self.default_timeout = default_timeout + + # Maps (spec.dag_hash(), spec.name) to the corresponding lock object + self.locks: Dict[Tuple[str, str], lk.Lock] = {} + + def lock(self, spec: "spack.spec.Spec", timeout: Optional[float] = None) -> lk.Lock: + """Returns a lock on a concrete spec. + + The lock is a byte range lock on the nth byte of a file. + + The lock file is ``self.lock_path``. + + n is the sys.maxsize-bit prefix of the DAG hash. This makes the likelihood of collision + very low AND it gives us readers-writer lock semantics with just a single lockfile, so + no cleanup required. 
+ """ + assert spec.concrete, "cannot lock a non-concrete spec" + timeout = timeout or self.default_timeout + key = self._lock_key(spec) + + if key not in self.locks: + self.locks[key] = self.raw_lock(spec, timeout=timeout) + else: + self.locks[key].default_timeout = timeout + + return self.locks[key] + + def raw_lock(self, spec: "spack.spec.Spec", timeout: Optional[float] = None) -> lk.Lock: + """Returns a raw lock for a Spec, but doesn't keep track of it.""" + return lk.Lock( + str(self.lock_path), + start=spec.dag_hash_bit_prefix(bit_length(sys.maxsize)), + length=1, + default_timeout=timeout, + desc=spec.name, + ) + + def has_lock(self, spec: "spack.spec.Spec") -> bool: + """Returns True if the spec is already managed by this spec locker""" + return self._lock_key(spec) in self.locks + + def _lock_key(self, spec: "spack.spec.Spec") -> Tuple[str, str]: + return (spec.dag_hash(), spec.name) + + @contextlib.contextmanager + def write_lock(self, spec: "spack.spec.Spec") -> Generator["SpecLocker", None, None]: + lock = self.lock(spec) + lock.acquire_write() + + try: + yield self + except lk.LockError: + # This addresses the case where a nested lock attempt fails inside + # of this context manager + raise + except (Exception, KeyboardInterrupt): + lock.release_write() + raise + else: + lock.release_write() + + def clear(self, spec: "spack.spec.Spec") -> Tuple[bool, Optional[lk.Lock]]: + key = self._lock_key(spec) + lock = self.locks.pop(key, None) + return bool(lock), lock + + def clear_all(self, clear_fn: Optional[Callable[[lk.Lock], Any]] = None) -> None: + if clear_fn is not None: + for lock in self.locks.values(): + clear_fn(lock) + self.locks.clear() + + +class FailureTracker: + """Tracks installation failures. 
+ + Prefix failure marking takes the form of a byte range lock on the nth + byte of a file for coordinating between concurrent parallel build + processes and a persistent file, named with the full hash and + containing the spec, in a subdirectory of the database to enable + persistence across overlapping but separate related build processes. + + The failure lock file lives alongside the install DB. + + ``n`` is the sys.maxsize-bit prefix of the associated DAG hash to make + the likelihood of collision very low with no cleanup required. + """ + + def __init__(self, root_dir: Union[str, pathlib.Path], default_timeout: Optional[float]): + #: Ensure a persistent location for dealing with parallel installation + #: failures (e.g., across near-concurrent processes). + self.dir = pathlib.Path(root_dir) / _DB_DIRNAME / "failures" + self.dir.mkdir(parents=True, exist_ok=True) + + self.locker = SpecLocker(failures_lock_path(root_dir), default_timeout=default_timeout) + + def clear(self, spec: "spack.spec.Spec", force: bool = False) -> None: + """Removes any persistent and cached failure tracking for the spec. + + see `mark()`. 
+ + Args: + spec: the spec whose failure indicators are being removed + force: True if the failure information should be cleared when a failure lock + exists for the file, or False if the failure should not be cleared (e.g., + it may be associated with a concurrent build) + """ + locked = self.lock_taken(spec) + if locked and not force: + tty.msg(f"Retaining failure marking for {spec.name} due to lock") + return + + if locked: + tty.warn(f"Removing failure marking despite lock for {spec.name}") + + succeeded, lock = self.locker.clear(spec) + if succeeded and lock is not None: + lock.release_write() + + if self.persistent_mark(spec): + path = self._path(spec) + tty.debug(f"Removing failure marking for {spec.name}") + try: + path.unlink() + except OSError as err: + tty.warn( + f"Unable to remove failure marking for {spec.name} ({str(path)}): {str(err)}" + ) + + def clear_all(self) -> None: + """Force remove install failure tracking files.""" + tty.debug("Releasing prefix failure locks") + self.locker.clear_all( + clear_fn=lambda x: x.release_write() if x.is_write_locked() else True + ) + + tty.debug("Removing prefix failure tracking files") + try: + for fail_mark in os.listdir(str(self.dir)): + try: + (self.dir / fail_mark).unlink() + except OSError as exc: + tty.warn(f"Unable to remove failure marking file {fail_mark}: {str(exc)}") + except OSError as exc: + tty.warn(f"Unable to remove failure marking files: {str(exc)}") + + def mark(self, spec: "spack.spec.Spec") -> lk.Lock: + """Marks a spec as failing to install. + + Args: + spec: spec that failed to install + """ + # Dump the spec to the failure file for (manual) debugging purposes + path = self._path(spec) + path.write_text(spec.to_json()) + + # Also ensure a failure lock is taken to prevent cleanup removal + # of failure status information during a concurrent parallel build. 
+ if not self.locker.has_lock(spec): + try: + mark = self.locker.lock(spec) + mark.acquire_write() + except lk.LockTimeoutError: + # Unlikely that another process failed to install at the same + # time but log it anyway. + tty.debug(f"PID {os.getpid()} failed to mark install failure for {spec.name}") + tty.warn(f"Unable to mark {spec.name} as failed.") + + return self.locker.lock(spec) + + def has_failed(self, spec: "spack.spec.Spec") -> bool: + """Return True if the spec is marked as failed.""" + # The failure was detected in this process. + if self.locker.has_lock(spec): + return True + + # The failure was detected by a concurrent process (e.g., an srun), + # which is expected to be holding a write lock if that is the case. + if self.lock_taken(spec): + return True + + # Determine if the spec may have been marked as failed by a separate + # spack build process running concurrently. + return self.persistent_mark(spec) + + def lock_taken(self, spec: "spack.spec.Spec") -> bool: + """Return True if another process has a failure lock on the spec.""" + check = self.locker.raw_lock(spec) + return check.is_write_locked() + + def persistent_mark(self, spec: "spack.spec.Spec") -> bool: + """Determine if the spec has a persistent failure marking.""" + return self._path(spec).exists() + + def _path(self, spec: "spack.spec.Spec") -> pathlib.Path: + """Return the path to the spec's failure file, which may not exist.""" + assert spec.concrete, "concrete spec required for failure path" + return self.dir / f"{spec.name}-{spec.dag_hash()}" - #: Per-process failure (lock) objects for each install prefix - _prefix_failures: Dict[str, lk.Lock] = {} +class Database: #: Fields written for each install record record_fields: Tuple[str, ...] 
= DEFAULT_INSTALL_RECORD_FIELDS @@ -392,24 +612,10 @@ class Database: self._verifier_path = os.path.join(self.database_directory, "index_verifier") self._lock_path = os.path.join(self.database_directory, "lock") - # This is for other classes to use to lock prefix directories. - self.prefix_lock_path = os.path.join(self.database_directory, "prefix_lock") - - # Ensure a persistent location for dealing with parallel installation - # failures (e.g., across near-concurrent processes). - self._failure_dir = os.path.join(self.database_directory, "failures") - - # Support special locks for handling parallel installation failures - # of a spec. - self.prefix_fail_path = os.path.join(self.database_directory, "prefix_failures") - # Create needed directories and files if not is_upstream and not os.path.exists(self.database_directory): fs.mkdirp(self.database_directory) - if not is_upstream and not os.path.exists(self._failure_dir): - fs.mkdirp(self._failure_dir) - self.is_upstream = is_upstream self.last_seen_verifier = "" # Failed write transactions (interrupted by exceptions) will alert @@ -423,15 +629,7 @@ class Database: # initialize rest of state. 
self.db_lock_timeout = lock_cfg.database_timeout - self.package_lock_timeout = lock_cfg.package_timeout - tty.debug("DATABASE LOCK TIMEOUT: {0}s".format(str(self.db_lock_timeout))) - timeout_format_str = ( - "{0}s".format(str(self.package_lock_timeout)) - if self.package_lock_timeout - else "No timeout" - ) - tty.debug("PACKAGE LOCK TIMEOUT: {0}".format(str(timeout_format_str))) self.lock: Union[ForbiddenLock, lk.Lock] if self.is_upstream: @@ -471,212 +669,6 @@ class Database: """Get a read lock context manager for use in a `with` block.""" return self._read_transaction_impl(self.lock, acquire=self._read) - def _failed_spec_path(self, spec): - """Return the path to the spec's failure file, which may not exist.""" - if not spec.concrete: - raise ValueError("Concrete spec required for failure path for {0}".format(spec.name)) - - return os.path.join(self._failure_dir, "{0}-{1}".format(spec.name, spec.dag_hash())) - - def clear_all_failures(self) -> None: - """Force remove install failure tracking files.""" - tty.debug("Releasing prefix failure locks") - for pkg_id in list(self._prefix_failures.keys()): - lock = self._prefix_failures.pop(pkg_id, None) - if lock: - lock.release_write() - - # Remove all failure markings (aka files) - tty.debug("Removing prefix failure tracking files") - for fail_mark in os.listdir(self._failure_dir): - try: - os.remove(os.path.join(self._failure_dir, fail_mark)) - except OSError as exc: - tty.warn( - "Unable to remove failure marking file {0}: {1}".format(fail_mark, str(exc)) - ) - - def clear_failure(self, spec: "spack.spec.Spec", force: bool = False) -> None: - """ - Remove any persistent and cached failure tracking for the spec. - - see `mark_failed()`. 
- - Args: - spec: the spec whose failure indicators are being removed - force: True if the failure information should be cleared when a prefix failure - lock exists for the file, or False if the failure should not be cleared (e.g., - it may be associated with a concurrent build) - """ - failure_locked = self.prefix_failure_locked(spec) - if failure_locked and not force: - tty.msg("Retaining failure marking for {0} due to lock".format(spec.name)) - return - - if failure_locked: - tty.warn("Removing failure marking despite lock for {0}".format(spec.name)) - - lock = self._prefix_failures.pop(spec.prefix, None) - if lock: - lock.release_write() - - if self.prefix_failure_marked(spec): - try: - path = self._failed_spec_path(spec) - tty.debug("Removing failure marking for {0}".format(spec.name)) - os.remove(path) - except OSError as err: - tty.warn( - "Unable to remove failure marking for {0} ({1}): {2}".format( - spec.name, path, str(err) - ) - ) - - def mark_failed(self, spec: "spack.spec.Spec") -> lk.Lock: - """ - Mark a spec as failing to install. - - Prefix failure marking takes the form of a byte range lock on the nth - byte of a file for coordinating between concurrent parallel build - processes and a persistent file, named with the full hash and - containing the spec, in a subdirectory of the database to enable - persistence across overlapping but separate related build processes. - - The failure lock file, ``spack.store.STORE.db.prefix_failures``, lives - alongside the install DB. ``n`` is the sys.maxsize-bit prefix of the - associated DAG hash to make the likelihood of collision very low with - no cleanup required. - """ - # Dump the spec to the failure file for (manual) debugging purposes - path = self._failed_spec_path(spec) - with open(path, "w") as f: - spec.to_json(f) - - # Also ensure a failure lock is taken to prevent cleanup removal - # of failure status information during a concurrent parallel build. - err = "Unable to mark {0.name} as failed." 
- - prefix = spec.prefix - if prefix not in self._prefix_failures: - mark = lk.Lock( - self.prefix_fail_path, - start=spec.dag_hash_bit_prefix(bit_length(sys.maxsize)), - length=1, - default_timeout=self.package_lock_timeout, - desc=spec.name, - ) - - try: - mark.acquire_write() - except lk.LockTimeoutError: - # Unlikely that another process failed to install at the same - # time but log it anyway. - tty.debug( - "PID {0} failed to mark install failure for {1}".format(os.getpid(), spec.name) - ) - tty.warn(err.format(spec)) - - # Whether we or another process marked it as a failure, track it - # as such locally. - self._prefix_failures[prefix] = mark - - return self._prefix_failures[prefix] - - def prefix_failed(self, spec: "spack.spec.Spec") -> bool: - """Return True if the prefix (installation) is marked as failed.""" - # The failure was detected in this process. - if spec.prefix in self._prefix_failures: - return True - - # The failure was detected by a concurrent process (e.g., an srun), - # which is expected to be holding a write lock if that is the case. - if self.prefix_failure_locked(spec): - return True - - # Determine if the spec may have been marked as failed by a separate - # spack build process running concurrently. - return self.prefix_failure_marked(spec) - - def prefix_failure_locked(self, spec: "spack.spec.Spec") -> bool: - """Return True if a process has a failure lock on the spec.""" - check = lk.Lock( - self.prefix_fail_path, - start=spec.dag_hash_bit_prefix(bit_length(sys.maxsize)), - length=1, - default_timeout=self.package_lock_timeout, - desc=spec.name, - ) - - return check.is_write_locked() - - def prefix_failure_marked(self, spec: "spack.spec.Spec") -> bool: - """Determine if the spec has a persistent failure marking.""" - return os.path.exists(self._failed_spec_path(spec)) - - def prefix_lock(self, spec: "spack.spec.Spec", timeout: Optional[float] = None) -> lk.Lock: - """Get a lock on a particular spec's installation directory. 
- - NOTE: The installation directory **does not** need to exist. - - Prefix lock is a byte range lock on the nth byte of a file. - - The lock file is ``spack.store.STORE.db.prefix_lock`` -- the DB - tells us what to call it and it lives alongside the install DB. - - n is the sys.maxsize-bit prefix of the DAG hash. This makes - likelihood of collision is very low AND it gives us - readers-writer lock semantics with just a single lockfile, so no - cleanup required. - """ - timeout = timeout or self.package_lock_timeout - prefix = spec.prefix - if prefix not in self._prefix_locks: - self._prefix_locks[prefix] = lk.Lock( - self.prefix_lock_path, - start=spec.dag_hash_bit_prefix(bit_length(sys.maxsize)), - length=1, - default_timeout=timeout, - desc=spec.name, - ) - elif timeout != self._prefix_locks[prefix].default_timeout: - self._prefix_locks[prefix].default_timeout = timeout - - return self._prefix_locks[prefix] - - @contextlib.contextmanager - def prefix_read_lock(self, spec): - prefix_lock = self.prefix_lock(spec) - prefix_lock.acquire_read() - - try: - yield self - except lk.LockError: - # This addresses the case where a nested lock attempt fails inside - # of this context manager - raise - except (Exception, KeyboardInterrupt): - prefix_lock.release_read() - raise - else: - prefix_lock.release_read() - - @contextlib.contextmanager - def prefix_write_lock(self, spec): - prefix_lock = self.prefix_lock(spec) - prefix_lock.acquire_write() - - try: - yield self - except lk.LockError: - # This addresses the case where a nested lock attempt fails inside - # of this context manager - raise - except (Exception, KeyboardInterrupt): - prefix_lock.release_write() - raise - else: - prefix_lock.release_write() - def _write_to_file(self, stream): """Write out the database in JSON format to the stream passed as argument. 
diff --git a/lib/spack/spack/installer.py b/lib/spack/spack/installer.py index 92d12e04cb..5b88808496 100644 --- a/lib/spack/spack/installer.py +++ b/lib/spack/spack/installer.py @@ -519,13 +519,6 @@ def _try_install_from_binary_cache( ) -def clear_failures() -> None: - """ - Remove all failure tracking markers for the Spack instance. - """ - spack.store.STORE.db.clear_all_failures() - - def combine_phase_logs(phase_log_files: List[str], log_path: str) -> None: """ Read set or list of logs and combine them into one file. @@ -1126,15 +1119,13 @@ class PackageInstaller: instance. """ - def __init__(self, installs: List[Tuple["spack.package_base.PackageBase", dict]] = []): + def __init__(self, installs: List[Tuple["spack.package_base.PackageBase", dict]] = []) -> None: """Initialize the installer. Args: installs (list): list of tuples, where each tuple consists of a package (PackageBase) and its associated install arguments (dict) - Return: - PackageInstaller: instance """ # List of build requests self.build_requests = [BuildRequest(pkg, install_args) for pkg, install_args in installs] @@ -1287,7 +1278,7 @@ class PackageInstaller: dep_id = package_id(dep_pkg) # Check for failure since a prefix lock is not required - if spack.store.STORE.db.prefix_failed(dep): + if spack.store.STORE.failure_tracker.has_failed(dep): action = "'spack install' the dependency" msg = "{0} is marked as an install failure: {1}".format(dep_id, action) raise InstallError(err.format(request.pkg_id, msg), pkg=dep_pkg) @@ -1502,7 +1493,7 @@ class PackageInstaller: if lock is None: tty.debug(msg.format("Acquiring", desc, pkg_id, pretty_seconds(timeout or 0))) op = "acquire" - lock = spack.store.STORE.db.prefix_lock(pkg.spec, timeout) + lock = spack.store.STORE.prefix_locker.lock(pkg.spec, timeout) if timeout != lock.default_timeout: tty.warn( "Expected prefix lock timeout {0}, not {1}".format( @@ -1627,12 +1618,12 @@ class PackageInstaller: # Clear any persistent failure markings _unless_ they are 
# associated with another process in this parallel build # of the spec. - spack.store.STORE.db.clear_failure(dep, force=False) + spack.store.STORE.failure_tracker.clear(dep, force=False) install_package = request.install_args.get("install_package") if install_package and request.pkg_id not in self.build_tasks: # Be sure to clear any previous failure - spack.store.STORE.db.clear_failure(request.spec, force=True) + spack.store.STORE.failure_tracker.clear(request.spec, force=True) # If not installing dependencies, then determine their # installation status before proceeding @@ -1888,7 +1879,7 @@ class PackageInstaller: err = "" if exc is None else ": {0}".format(str(exc)) tty.debug("Flagging {0} as failed{1}".format(pkg_id, err)) if mark: - self.failed[pkg_id] = spack.store.STORE.db.mark_failed(task.pkg.spec) + self.failed[pkg_id] = spack.store.STORE.failure_tracker.mark(task.pkg.spec) else: self.failed[pkg_id] = None task.status = STATUS_FAILED @@ -2074,7 +2065,7 @@ class PackageInstaller: # Flag a failed spec. Do not need an (install) prefix lock since # assume using a separate (failed) prefix lock file. - if pkg_id in self.failed or spack.store.STORE.db.prefix_failed(spec): + if pkg_id in self.failed or spack.store.STORE.failure_tracker.has_failed(spec): term_status.clear() tty.warn("{0} failed to install".format(pkg_id)) self._update_failed(task) diff --git a/lib/spack/spack/package_base.py b/lib/spack/spack/package_base.py index 6571f6d1dc..1d3dba5045 100644 --- a/lib/spack/spack/package_base.py +++ b/lib/spack/spack/package_base.py @@ -2209,7 +2209,7 @@ class PackageBase(WindowsRPath, PackageViewMixin, metaclass=PackageMeta): pkg = None # Pre-uninstall hook runs first. 
- with spack.store.STORE.db.prefix_write_lock(spec): + with spack.store.STORE.prefix_locker.write_lock(spec): if pkg is not None: try: spack.hooks.pre_uninstall(spec) diff --git a/lib/spack/spack/stage.py b/lib/spack/spack/stage.py index 14754e1f52..1c932da4bb 100644 --- a/lib/spack/spack/stage.py +++ b/lib/spack/spack/stage.py @@ -326,7 +326,7 @@ class Stage: self.keep = keep # File lock for the stage directory. We use one file for all - # stage locks. See spack.database.Database.prefix_lock for + # stage locks. See spack.database.SpecLocker.lock for # details on this approach. self._lock = None if lock: diff --git a/lib/spack/spack/store.py b/lib/spack/spack/store.py index a923080e9f..a0ab11c4c1 100644 --- a/lib/spack/spack/store.py +++ b/lib/spack/spack/store.py @@ -25,13 +25,14 @@ import uuid from typing import Any, Callable, Dict, Generator, List, Optional, Union import llnl.util.lang -import llnl.util.tty as tty +from llnl.util import tty import spack.config import spack.database import spack.directory_layout import spack.error import spack.paths +import spack.spec import spack.util.path #: default installation root, relative to the Spack install path @@ -134,18 +135,21 @@ def parse_install_tree(config_dict): class Store: """A store is a path full of installed Spack packages. - Stores consist of packages installed according to a - ``DirectoryLayout``, along with an index, or _database_ of their - contents. The directory layout controls what paths look like and how - Spack ensures that each unique spec gets its own unique directory (or - not, though we don't recommend that). The database is a single file - that caches metadata for the entire Spack installation. It prevents - us from having to spider the install tree to figure out what's there. + Stores consist of packages installed according to a ``DirectoryLayout``, along with a database + of their contents. 
+ + The directory layout controls what paths look like and how Spack ensures that each unique spec + gets its own unique directory (or not, though we don't recommend that). + + The database is a single file that caches metadata for the entire Spack installation. It + prevents us from having to spider the install tree to figure out what's there. + + The store is also able to lock installation prefixes, and to mark installation failures. Args: root: path to the root of the install tree - unpadded_root: path to the root of the install tree without padding. - The sbang script has to be installed here to work with padded roots + unpadded_root: path to the root of the install tree without padding. The sbang script has + to be installed here to work with padded roots projections: expression according to guidelines that describes how to construct a path to a package prefix in this store hash_length: length of the hashes used in the directory layout. Spec hash suffixes will be @@ -170,6 +174,19 @@ class Store: self.upstreams = upstreams self.lock_cfg = lock_cfg self.db = spack.database.Database(root, upstream_dbs=upstreams, lock_cfg=lock_cfg) + + timeout_format_str = ( + f"{str(lock_cfg.package_timeout)}s" if lock_cfg.package_timeout else "No timeout" + ) + tty.debug("PACKAGE LOCK TIMEOUT: {0}".format(str(timeout_format_str))) + + self.prefix_locker = spack.database.SpecLocker( + spack.database.prefix_lock_path(root), default_timeout=lock_cfg.package_timeout + ) + self.failure_tracker = spack.database.FailureTracker( + self.root, default_timeout=lock_cfg.package_timeout + ) + self.layout = spack.directory_layout.DirectoryLayout( root, projections=projections, hash_length=hash_length ) diff --git a/lib/spack/spack/test/cmd/clean.py b/lib/spack/spack/test/cmd/clean.py index 8c711e4c12..12b1ca4991 100644 --- a/lib/spack/spack/test/cmd/clean.py +++ b/lib/spack/spack/test/cmd/clean.py @@ -10,9 +10,11 @@ import pytest import llnl.util.filesystem as fs import spack.caches +import 
spack.cmd.clean import spack.main import spack.package_base import spack.stage +import spack.store clean = spack.main.SpackCommand("clean") @@ -33,7 +35,7 @@ def mock_calls_for_clean(monkeypatch): monkeypatch.setattr(spack.stage, "purge", Counter("stages")) monkeypatch.setattr(spack.caches.fetch_cache, "destroy", Counter("downloads"), raising=False) monkeypatch.setattr(spack.caches.misc_cache, "destroy", Counter("caches")) - monkeypatch.setattr(spack.installer, "clear_failures", Counter("failures")) + monkeypatch.setattr(spack.store.STORE.failure_tracker, "clear_all", Counter("failures")) monkeypatch.setattr(spack.cmd.clean, "remove_python_cache", Counter("python_cache")) yield counts diff --git a/lib/spack/spack/test/cmd/install.py b/lib/spack/spack/test/cmd/install.py index e0518ab73d..872d91f0a8 100644 --- a/lib/spack/spack/test/cmd/install.py +++ b/lib/spack/spack/test/cmd/install.py @@ -23,6 +23,7 @@ import spack.config import spack.environment as ev import spack.hash_types as ht import spack.package_base +import spack.store import spack.util.executable from spack.error import SpackError from spack.main import SpackCommand @@ -705,9 +706,11 @@ def test_cache_only_fails(tmpdir, mock_fetch, install_mockery, capfd): assert "was not installed" in out # Check that failure prefix locks are still cached - failure_lock_prefixes = ",".join(spack.store.STORE.db._prefix_failures.keys()) - assert "libelf" in failure_lock_prefixes - assert "libdwarf" in failure_lock_prefixes + failed_packages = [ + pkg_name for dag_hash, pkg_name in spack.store.STORE.failure_tracker.locker.locks.keys() + ] + assert "libelf" in failed_packages + assert "libdwarf" in failed_packages def test_install_only_dependencies(tmpdir, mock_fetch, install_mockery): diff --git a/lib/spack/spack/test/conftest.py b/lib/spack/spack/test/conftest.py index e75585b0db..4f0632faa2 100644 --- a/lib/spack/spack/test/conftest.py +++ b/lib/spack/spack/test/conftest.py @@ -950,21 +950,14 @@ def 
disable_compiler_execution(monkeypatch, request): @pytest.fixture(scope="function") -def install_mockery(temporary_store, mutable_config, mock_packages): +def install_mockery(temporary_store: spack.store.Store, mutable_config, mock_packages): """Hooks a fake install directory, DB, and stage directory into Spack.""" # We use a fake package, so temporarily disable checksumming with spack.config.override("config:checksum", False): yield - # Also wipe out any cached prefix failure locks (associated with - # the session-scoped mock archive). - for pkg_id in list(temporary_store.db._prefix_failures.keys()): - lock = spack.store.STORE.db._prefix_failures.pop(pkg_id, None) - if lock: - try: - lock.release_write() - except Exception: - pass + # Wipe out any cached prefix failure locks (associated with the session-scoped mock archive) + temporary_store.failure_tracker.clear_all() @pytest.fixture(scope="function") diff --git a/lib/spack/spack/test/database.py b/lib/spack/spack/test/database.py index 3f5d7887b7..c1e92eedca 100644 --- a/lib/spack/spack/test/database.py +++ b/lib/spack/spack/test/database.py @@ -807,22 +807,22 @@ def test_query_spec_with_non_conditional_virtual_dependency(database): def test_failed_spec_path_error(database): """Ensure spec not concrete check is covered.""" s = spack.spec.Spec("a") - with pytest.raises(ValueError, match="Concrete spec required"): - spack.store.STORE.db._failed_spec_path(s) + with pytest.raises(AssertionError, match="concrete spec required"): + spack.store.STORE.failure_tracker.mark(s) @pytest.mark.db def test_clear_failure_keep(mutable_database, monkeypatch, capfd): """Add test coverage for clear_failure operation when to be retained.""" - def _is(db, spec): + def _is(self, spec): return True # Pretend the spec has been failure locked - monkeypatch.setattr(spack.database.Database, "prefix_failure_locked", _is) + monkeypatch.setattr(spack.database.FailureTracker, "lock_taken", _is) - s = spack.spec.Spec("a") - 
spack.store.STORE.db.clear_failure(s) + s = spack.spec.Spec("a").concretized() + spack.store.STORE.failure_tracker.clear(s) out = capfd.readouterr()[0] assert "Retaining failure marking" in out @@ -831,16 +831,16 @@ def test_clear_failure_keep(mutable_database, monkeypatch, capfd): def test_clear_failure_forced(default_mock_concretization, mutable_database, monkeypatch, capfd): """Add test coverage for clear_failure operation when force.""" - def _is(db, spec): + def _is(self, spec): return True # Pretend the spec has been failure locked - monkeypatch.setattr(spack.database.Database, "prefix_failure_locked", _is) + monkeypatch.setattr(spack.database.FailureTracker, "lock_taken", _is) # Ensure raise OSError when try to remove the non-existent marking - monkeypatch.setattr(spack.database.Database, "prefix_failure_marked", _is) + monkeypatch.setattr(spack.database.FailureTracker, "persistent_mark", _is) s = default_mock_concretization("a") - spack.store.STORE.db.clear_failure(s, force=True) + spack.store.STORE.failure_tracker.clear(s, force=True) out = capfd.readouterr()[1] assert "Removing failure marking despite lock" in out assert "Unable to remove failure marking" in out @@ -858,55 +858,34 @@ def test_mark_failed(default_mock_concretization, mutable_database, monkeypatch, with tmpdir.as_cwd(): s = default_mock_concretization("a") - spack.store.STORE.db.mark_failed(s) + spack.store.STORE.failure_tracker.mark(s) out = str(capsys.readouterr()[1]) assert "Unable to mark a as failed" in out - # Clean up the failure mark to ensure it does not interfere with other - # tests using the same spec. 
- del spack.store.STORE.db._prefix_failures[s.prefix] + spack.store.STORE.failure_tracker.clear_all() @pytest.mark.db def test_prefix_failed(default_mock_concretization, mutable_database, monkeypatch): - """Add coverage to prefix_failed operation.""" - - def _is(db, spec): - return True + """Add coverage to failed operation.""" s = default_mock_concretization("a") # Confirm the spec is not already marked as failed - assert not spack.store.STORE.db.prefix_failed(s) + assert not spack.store.STORE.failure_tracker.has_failed(s) # Check that a failure entry is sufficient - spack.store.STORE.db._prefix_failures[s.prefix] = None - assert spack.store.STORE.db.prefix_failed(s) + spack.store.STORE.failure_tracker.mark(s) + assert spack.store.STORE.failure_tracker.has_failed(s) # Remove the entry and check again - del spack.store.STORE.db._prefix_failures[s.prefix] - assert not spack.store.STORE.db.prefix_failed(s) + spack.store.STORE.failure_tracker.clear(s) + assert not spack.store.STORE.failure_tracker.has_failed(s) # Now pretend that the prefix failure is locked - monkeypatch.setattr(spack.database.Database, "prefix_failure_locked", _is) - assert spack.store.STORE.db.prefix_failed(s) - - -def test_prefix_read_lock_error(default_mock_concretization, mutable_database, monkeypatch): - """Cover the prefix read lock exception.""" - - def _raise(db, spec): - raise lk.LockError("Mock lock error") - - s = default_mock_concretization("a") - - # Ensure subsequent lock operations fail - monkeypatch.setattr(lk.Lock, "acquire_read", _raise) - - with pytest.raises(Exception): - with spack.store.STORE.db.prefix_read_lock(s): - assert False + monkeypatch.setattr(spack.database.FailureTracker, "lock_taken", lambda self, spec: True) + assert spack.store.STORE.failure_tracker.has_failed(s) def test_prefix_write_lock_error(default_mock_concretization, mutable_database, monkeypatch): @@ -921,7 +900,7 @@ def test_prefix_write_lock_error(default_mock_concretization, mutable_database, 
monkeypatch.setattr(lk.Lock, "acquire_write", _raise) with pytest.raises(Exception): - with spack.store.STORE.db.prefix_write_lock(s): + with spack.store.STORE.prefix_locker.write_lock(s): assert False diff --git a/lib/spack/spack/test/install.py b/lib/spack/spack/test/install.py index a6ac33fe96..76ab4d4698 100644 --- a/lib/spack/spack/test/install.py +++ b/lib/spack/spack/test/install.py @@ -159,7 +159,7 @@ def test_partial_install_delete_prefix_and_stage(install_mockery, mock_fetch, wo s.package.remove_prefix = rm_prefix_checker.remove_prefix # must clear failure markings for the package before re-installing it - spack.store.STORE.db.clear_failure(s, True) + spack.store.STORE.failure_tracker.clear(s, True) s.package.set_install_succeed() s.package.stage = MockStage(s.package.stage) @@ -354,7 +354,7 @@ def test_partial_install_keep_prefix(install_mockery, mock_fetch, monkeypatch, w assert os.path.exists(s.package.prefix) # must clear failure markings for the package before re-installing it - spack.store.STORE.db.clear_failure(s, True) + spack.store.STORE.failure_tracker.clear(s, True) s.package.set_install_succeed() s.package.stage = MockStage(s.package.stage) diff --git a/lib/spack/spack/test/installer.py b/lib/spack/spack/test/installer.py index a937f8f770..15888e683e 100644 --- a/lib/spack/spack/test/installer.py +++ b/lib/spack/spack/test/installer.py @@ -19,6 +19,7 @@ import spack.binary_distribution import spack.compilers import spack.concretize import spack.config +import spack.database import spack.installer as inst import spack.package_base import spack.package_prefs as prefs @@ -364,7 +365,7 @@ def test_ensure_locked_err(install_mockery, monkeypatch, tmpdir, capsys): """Test _ensure_locked when a non-lock exception is raised.""" mock_err_msg = "Mock exception error" - def _raise(lock, timeout): + def _raise(lock, timeout=None): raise RuntimeError(mock_err_msg) const_arg = installer_args(["trivial-install-test-package"], {}) @@ -432,7 +433,7 @@ def 
test_ensure_locked_new_lock(install_mockery, tmpdir, lock_type, reads, write def test_ensure_locked_new_warn(install_mockery, monkeypatch, tmpdir, capsys): - orig_pl = spack.database.Database.prefix_lock + orig_pl = spack.database.SpecLocker.lock def _pl(db, spec, timeout): lock = orig_pl(db, spec, timeout) @@ -444,7 +445,7 @@ def test_ensure_locked_new_warn(install_mockery, monkeypatch, tmpdir, capsys): installer = create_installer(const_arg) spec = installer.build_requests[0].pkg.spec - monkeypatch.setattr(spack.database.Database, "prefix_lock", _pl) + monkeypatch.setattr(spack.database.SpecLocker, "lock", _pl) lock_type = "read" ltype, lock = installer._ensure_locked(lock_type, spec.package) @@ -597,59 +598,50 @@ def test_dump_packages_deps_errs(install_mockery, tmpdir, monkeypatch, capsys): assert "Couldn't copy in provenance for cmake" in out -def test_clear_failures_success(install_mockery): +def test_clear_failures_success(tmpdir): """Test the clear_failures happy path.""" + failures = spack.database.FailureTracker(str(tmpdir), default_timeout=0.1) - # Set up a test prefix failure lock - lock = lk.Lock( - spack.store.STORE.db.prefix_fail_path, start=1, length=1, default_timeout=1e-9, desc="test" - ) - try: - lock.acquire_write() - except lk.LockTimeoutError: - tty.warn("Failed to write lock the test install failure") - spack.store.STORE.db._prefix_failures["test"] = lock + spec = spack.spec.Spec("a") + spec._mark_concrete() - # Set up a fake failure mark (or file) - fs.touch(os.path.join(spack.store.STORE.db._failure_dir, "test")) + # Set up a test prefix failure lock + failures.mark(spec) + assert failures.has_failed(spec) # Now clear failure tracking - inst.clear_failures() + failures.clear_all() # Ensure there are no cached failure locks or failure marks - assert len(spack.store.STORE.db._prefix_failures) == 0 - assert len(os.listdir(spack.store.STORE.db._failure_dir)) == 0 + assert len(failures.locker.locks) == 0 + assert len(os.listdir(failures.dir)) == 
0 # Ensure the core directory and failure lock file still exist - assert os.path.isdir(spack.store.STORE.db._failure_dir) + assert os.path.isdir(failures.dir) + # Locks on windows are a no-op if sys.platform != "win32": - assert os.path.isfile(spack.store.STORE.db.prefix_fail_path) + assert os.path.isfile(failures.locker.lock_path) -def test_clear_failures_errs(install_mockery, monkeypatch, capsys): +@pytest.mark.xfail(sys.platform == "win32", reason="chmod does not prevent removal on Win") +def test_clear_failures_errs(tmpdir, capsys): """Test the clear_failures exception paths.""" - orig_fn = os.remove - err_msg = "Mock os remove" - - def _raise_except(path): - raise OSError(err_msg) - - # Set up a fake failure mark (or file) - fs.touch(os.path.join(spack.store.STORE.db._failure_dir, "test")) + failures = spack.database.FailureTracker(str(tmpdir), default_timeout=0.1) + spec = spack.spec.Spec("a") + spec._mark_concrete() + failures.mark(spec) - monkeypatch.setattr(os, "remove", _raise_except) + # Make the file marker not writeable, so that clearing_failures fails + failures.dir.chmod(0o000) # Clear failure tracking - inst.clear_failures() + failures.clear_all() # Ensure expected warning generated out = str(capsys.readouterr()[1]) assert "Unable to remove failure" in out - assert err_msg in out - - # Restore remove for teardown - monkeypatch.setattr(os, "remove", orig_fn) + failures.dir.chmod(0o750) def test_combine_phase_logs(tmpdir): @@ -694,14 +686,18 @@ def test_combine_phase_logs_does_not_care_about_encoding(tmpdir): assert f.read() == data * 2 -def test_check_deps_status_install_failure(install_mockery, monkeypatch): +def test_check_deps_status_install_failure(install_mockery): + """Tests that checking the dependency status on a request to install + 'a' fails, if we mark the dependency as failed. 
+ """ + s = spack.spec.Spec("a").concretized() + for dep in s.traverse(root=False): + spack.store.STORE.failure_tracker.mark(dep) + const_arg = installer_args(["a"], {}) installer = create_installer(const_arg) request = installer.build_requests[0] - # Make sure the package is identified as failed - monkeypatch.setattr(spack.database.Database, "prefix_failed", _true) - with pytest.raises(inst.InstallError, match="install failure"): installer._check_deps_status(request) @@ -1006,7 +1002,7 @@ def test_install_failed(install_mockery, monkeypatch, capsys): installer = create_installer(const_arg) # Make sure the package is identified as failed - monkeypatch.setattr(spack.database.Database, "prefix_failed", _true) + monkeypatch.setattr(spack.database.FailureTracker, "has_failed", _true) with pytest.raises(inst.InstallError, match="request failed"): installer.install() @@ -1022,7 +1018,7 @@ def test_install_failed_not_fast(install_mockery, monkeypatch, capsys): installer = create_installer(const_arg) # Make sure the package is identified as failed - monkeypatch.setattr(spack.database.Database, "prefix_failed", _true) + monkeypatch.setattr(spack.database.FailureTracker, "has_failed", _true) with pytest.raises(inst.InstallError, match="request failed"): installer.install() @@ -1121,7 +1117,7 @@ def test_install_fail_fast_on_detect(install_mockery, monkeypatch, capsys): # # This will prevent b from installing, which will cause the build of a # to be skipped. - monkeypatch.setattr(spack.database.Database, "prefix_failed", _true) + monkeypatch.setattr(spack.database.FailureTracker, "has_failed", _true) with pytest.raises(inst.InstallError, match="after first install failure"): installer.install() -- cgit v1.2.3-60-g2f50