From 409e7a2e64f1010e961d23e2f3cc97a10180a95f Mon Sep 17 00:00:00 2001 From: Todd Gamblin Date: Tue, 16 Aug 2016 13:13:04 -0700 Subject: Faster database loading. - use a 3-pass algorithm to load the installed package DAG. - avoid redundant hashing/comparing on load. --- lib/spack/spack/database.py | 54 +++++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/lib/spack/spack/database.py b/lib/spack/spack/database.py index 1240e9a658..ba29b8da30 100644 --- a/lib/spack/spack/database.py +++ b/lib/spack/spack/database.py @@ -198,7 +198,7 @@ class Database(object): except YAMLError as e: raise SpackYAMLError("error writing YAML database:", str(e)) - def _read_spec_from_yaml(self, hash_key, installs, parent_key=None): + def _read_spec_from_yaml(self, hash_key, installs): """Recursively construct a spec from a hash in a YAML database. Does not do any locking. @@ -212,19 +212,27 @@ class Database(object): # Build spec from dict first. spec = Spec.from_node_dict(spec_dict) + return spec + def _assign_dependencies(self, hash_key, installs, data): # Add dependencies from other records in the install DB to # form a full spec. + spec = data[hash_key].spec + spec_dict = installs[hash_key]['spec'] + if 'dependencies' in spec_dict[spec.name]: yaml_deps = spec_dict[spec.name]['dependencies'] for dname, dhash, dtypes in Spec.read_yaml_dep_specs(yaml_deps): - child = self._read_spec_from_yaml(dhash, installs, hash_key) - spec._add_dependency(child, dtypes) + if dhash not in data: + tty.warn("Missing dependency not in database: ", + "%s needs %s-%s" % ( + spec.format('$_$#'), dname, dhash[:7])) + continue - # Specs from the database need to be marked concrete because - # they represent actual installations. 
- spec._mark_concrete() - return spec + # defensive copy (not sure everything handles extra + # parent links yet) + child = data[dhash].spec + spec._add_dependency(child, dtypes) def _read_from_yaml(self, stream): """ @@ -267,22 +275,22 @@ class Database(object): self.reindex(spack.install_layout) installs = dict((k, v.to_dict()) for k, v in self._data.items()) - # Iterate through database and check each record. + # Build up the database in three passes: + # + # 1. Read in all specs without dependencies. + # 2. Hook dependencies up among specs. + # 3. Mark all specs concrete. + # + # The database is built up so that ALL specs in it share nodes + # (i.e., its specs are a true Merkle DAG, unlike most specs.) + + # Pass 1: Iterate through database and build specs w/o dependencies data = {} for hash_key, rec in installs.items(): try: # This constructs a spec DAG from the list of all installs spec = self._read_spec_from_yaml(hash_key, installs) - # Validate the spec by ensuring the stored and actual - # hashes are the same. - spec_hash = spec.dag_hash() - if not spec_hash == hash_key: - tty.warn( - "Hash mismatch in database: %s -> spec with hash %s" % - (hash_key, spec_hash)) - continue # TODO: is skipping the right thing to do? - # Insert the brand new spec in the database. Each # spec has its own copies of its dependency specs. # TODO: would a more immmutable spec implementation simplify @@ -296,6 +304,18 @@ class Database(object): "cause: %s: %s" % (type(e).__name__, str(e))) raise + # Pass 2: Assign dependencies once all specs are created. + for hash_key in data: + self._assign_dependencies(hash_key, installs, data) + + # Pass 3: Mark all specs concrete. Specs representing real + # installations must be explicitly marked. + # We do this *after* all dependencies are connected because if we + # do it *while* we're constructing specs, it causes hashes to be + # cached prematurely. 
+ for hash_key, rec in data.items(): + rec.spec._mark_concrete() + self._data = data def reindex(self, directory_layout): -- cgit v1.2.3-60-g2f50