From 409e7a2e64f1010e961d23e2f3cc97a10180a95f Mon Sep 17 00:00:00 2001
From: Todd Gamblin <tgamblin@llnl.gov>
Date: Tue, 16 Aug 2016 13:13:04 -0700
Subject: Faster database loading.

- use a 3-pass algorithm to load the installed package DAG.

- avoid redundant hashing/comparing on load.
---
 lib/spack/spack/database.py | 54 +++++++++++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/lib/spack/spack/database.py b/lib/spack/spack/database.py
index 1240e9a658..ba29b8da30 100644
--- a/lib/spack/spack/database.py
+++ b/lib/spack/spack/database.py
@@ -198,7 +198,7 @@ class Database(object):
         except YAMLError as e:
             raise SpackYAMLError("error writing YAML database:", str(e))
 
-    def _read_spec_from_yaml(self, hash_key, installs, parent_key=None):
+    def _read_spec_from_yaml(self, hash_key, installs):
         """Recursively construct a spec from a hash in a YAML database.
 
         Does not do any locking.
@@ -212,19 +212,27 @@ class Database(object):
 
         # Build spec from dict first.
         spec = Spec.from_node_dict(spec_dict)
+        return spec
 
+    def _assign_dependencies(self, hash_key, installs, data):
         # Add dependencies from other records in the install DB to
         # form a full spec.
+        spec = data[hash_key].spec
+        spec_dict = installs[hash_key]['spec']
+
         if 'dependencies' in spec_dict[spec.name]:
             yaml_deps = spec_dict[spec.name]['dependencies']
             for dname, dhash, dtypes in Spec.read_yaml_dep_specs(yaml_deps):
-                child = self._read_spec_from_yaml(dhash, installs, hash_key)
-                spec._add_dependency(child, dtypes)
+                if dhash not in data:
+                    tty.warn("Missing dependency not in database: ",
+                             "%s needs %s-%s" % (
+                                 spec.format('$_$#'), dname, dhash[:7]))
+                    continue
 
-        # Specs from the database need to be marked concrete because
-        # they represent actual installations.
-        spec._mark_concrete()
-        return spec
+                # share the node, no defensive copy (not sure everything
+                # handles extra parent links yet)
+                child = data[dhash].spec
+                spec._add_dependency(child, dtypes)
 
     def _read_from_yaml(self, stream):
         """
@@ -267,22 +275,22 @@ class Database(object):
             self.reindex(spack.install_layout)
             installs = dict((k, v.to_dict()) for k, v in self._data.items())
 
-        # Iterate through database and check each record.
+        # Build up the database in three passes:
+        #
+        #   1. Read in all specs without dependencies.
+        #   2. Hook dependencies up among specs.
+        #   3. Mark all specs concrete.
+        #
+        # The database is built up so that ALL specs in it share nodes
+        # (i.e., its specs are a true Merkle DAG, unlike most specs.)
+
+        # Pass 1: Iterate through database and build specs w/o dependencies
         data = {}
         for hash_key, rec in installs.items():
             try:
                 # This constructs a spec DAG from the list of all installs
                 spec = self._read_spec_from_yaml(hash_key, installs)
 
-                # Validate the spec by ensuring the stored and actual
-                # hashes are the same.
-                spec_hash = spec.dag_hash()
-                if not spec_hash == hash_key:
-                    tty.warn(
-                        "Hash mismatch in database: %s -> spec with hash %s" %
-                        (hash_key, spec_hash))
-                    continue  # TODO: is skipping the right thing to do?
-
                 # Insert the brand new spec in the database.  Each
                 # spec has its own copies of its dependency specs.
                 # TODO: would a more immmutable spec implementation simplify
@@ -296,6 +304,18 @@ class Database(object):
                          "cause: %s: %s" % (type(e).__name__, str(e)))
                 raise
 
+        # Pass 2: Assign dependencies once all specs are created.
+        for hash_key in data:
+            self._assign_dependencies(hash_key, installs, data)
+
+        # Pass 3: Mark all specs concrete.  Specs representing real
+        # installations must be explicitly marked.
+        # We do this *after* all dependencies are connected because if we
+        # do it *while* we're constructing specs, it causes hashes to be
+        # cached prematurely.
+        for hash_key, rec in data.items():
+            rec.spec._mark_concrete()
+
         self._data = data
 
     def reindex(self, directory_layout):
-- 
cgit v1.2.3-70-g09d2