From a8aad95d41f0f1d4677281dc443b2c996dc4e1eb Mon Sep 17 00:00:00 2001 From: Todd Gamblin Date: Tue, 16 Aug 2016 11:29:36 -0700 Subject: Specs now cache result of "fast" in-memory hash. - Hash causes major slowdown for reading/setting up large DBs - New version caches hash for concrete specs, which includes all specs in the install DB --- lib/spack/spack/spec.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/spack/spack/spec.py b/lib/spack/spack/spec.py index d8a7cf9d7b..bc7250eec5 100644 --- a/lib/spack/spack/spec.py +++ b/lib/spack/spack/spec.py @@ -504,6 +504,7 @@ class Spec(object): self.variants.spec = self self.namespace = other.namespace self._hash = other._hash + self._cmp_key_cache = other._cmp_key_cache # Specs are by default not assumed to be normal, but in some # cases we've read them from a file want to assume normal. @@ -858,9 +859,10 @@ class Spec(object): # Edge traversal yields but skips children of visited nodes if not (key in visited and cover == 'edges'): # This code determines direction and yields the children/parents + successors = deps if direction == 'parents': - successors = self.dependents_dict() + successors = self.dependents_dict() # TODO: deptype? visited.add(key) for name in sorted(successors): @@ -1278,15 +1280,15 @@ class Spec(object): # Mark everything in the spec as concrete, as well. self._mark_concrete() - def _mark_concrete(self): + def _mark_concrete(self, value=True): """Mark this spec and its dependencies as concrete. Only for internal use -- client code should use "concretize" unless there is a need to force a spec to be concrete. """ for s in self.traverse(deptype_query=alldeps): - s._normal = True - s._concrete = True + s._normal = value + s._concrete = value def concretized(self): """This is a non-destructive version of concretize(). First clones, @@ -1533,6 +1535,10 @@ class Spec(object): if self._normal and not force: return False + # avoid any assumptions about concreteness when forced + if force: + self._mark_concrete(False) + # Ensure first that all packages & compilers in the DAG exist. self.validate_names() # Get all the dependencies into one DependencyMap @@ -2059,10 +2065,17 @@ class Spec(object): 1. A tuple describing this node in the DAG. 2. The hash of each of this node's dependencies' cmp_keys. """ - dep_dict = self.dependencies_dict(deptype=('link', 'run')) - return self._cmp_node() + ( - tuple(hash(dep_dict[name]) - for name in sorted(dep_dict)),) + if self._cmp_key_cache: + return self._cmp_key_cache + + dep_tuple = tuple( + (d.spec.name, hash(d.spec), tuple(sorted(d.deptypes))) + for name, d in sorted(self._dependencies.items())) + + key = (self._cmp_node(), dep_tuple) + if self._concrete: + self._cmp_key_cache = key + return key def colorized(self): return colorize_spec(self) @@ -2457,6 +2470,7 @@ class SpecParser(spack.parse.Parser): spec._dependencies = DependencyMap() spec.namespace = spec_namespace spec._hash = None + spec._cmp_key_cache = None spec._normal = False spec._concrete = False -- cgit v1.2.3-70-g09d2 From 409e7a2e64f1010e961d23e2f3cc97a10180a95f Mon Sep 17 00:00:00 2001 From: Todd Gamblin Date: Tue, 16 Aug 2016 13:13:04 -0700 Subject: Faster database loading. - use a 3-pass algorithm to load the installed package DAG. - avoid redundant hashing/comparing on load. --- lib/spack/spack/database.py | 54 +++++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/lib/spack/spack/database.py b/lib/spack/spack/database.py index 1240e9a658..ba29b8da30 100644 --- a/lib/spack/spack/database.py +++ b/lib/spack/spack/database.py @@ -198,7 +198,7 @@ class Database(object): except YAMLError as e: raise SpackYAMLError("error writing YAML database:", str(e)) - def _read_spec_from_yaml(self, hash_key, installs, parent_key=None): + def _read_spec_from_yaml(self, hash_key, installs): """Recursively construct a spec from a hash in a YAML database. Does not do any locking. @@ -212,19 +212,27 @@ class Database(object): # Build spec from dict first. spec = Spec.from_node_dict(spec_dict) + return spec + def _assign_dependencies(self, hash_key, installs, data): # Add dependencies from other records in the install DB to # form a full spec. + spec = data[hash_key].spec + spec_dict = installs[hash_key]['spec'] + if 'dependencies' in spec_dict[spec.name]: yaml_deps = spec_dict[spec.name]['dependencies'] for dname, dhash, dtypes in Spec.read_yaml_dep_specs(yaml_deps): - child = self._read_spec_from_yaml(dhash, installs, hash_key) - spec._add_dependency(child, dtypes) + if dhash not in data: + tty.warn("Missing dependency not in database: ", + "%s needs %s-%s" % ( + spec.format('$_$#'), dname, dhash[:7])) + continue - # Specs from the database need to be marked concrete because - # they represent actual installations. - spec._mark_concrete() - return spec + # defensive copy (not sure everything handles extra + # parent links yet) + child = data[dhash].spec + spec._add_dependency(child, dtypes) def _read_from_yaml(self, stream): """ @@ -267,22 +275,22 @@ class Database(object): self.reindex(spack.install_layout) installs = dict((k, v.to_dict()) for k, v in self._data.items()) - # Iterate through database and check each record. + # Build up the database in three passes: + # + # 1. Read in all specs without dependencies. + # 2. Hook dependencies up among specs. + # 3. Mark all specs concrete. + # + # The database is built up so that ALL specs in it share nodes + # (i.e., its specs are a true Merkle DAG, unlike most specs.) + + # Pass 1: Iterate through database and build specs w/o dependencies data = {} for hash_key, rec in installs.items(): try: # This constructs a spec DAG from the list of all installs spec = self._read_spec_from_yaml(hash_key, installs) - # Validate the spec by ensuring the stored and actual - # hashes are the same. - spec_hash = spec.dag_hash() - if not spec_hash == hash_key: - tty.warn( - "Hash mismatch in database: %s -> spec with hash %s" % - (hash_key, spec_hash)) - continue # TODO: is skipping the right thing to do? - # Insert the brand new spec in the database. Each # spec has its own copies of its dependency specs. # TODO: would a more immmutable spec implementation simplify @@ -296,6 +304,18 @@ class Database(object): "cause: %s: %s" % (type(e).__name__, str(e))) raise + # Pass 2: Assign dependencies once all specs are created. + for hash_key in data: + self._assign_dependencies(hash_key, installs, data) + + # Pass 3: Mark all specs concrete. Specs representing real + # installations must be explicitly marked. + # We do this *after* all dependencies are connected because if we + # do it *while* we're constructing specs,it causes hashes to be + # cached prematurely. + for hash_key, rec in data.items(): + rec.spec._mark_concrete() + self._data = data def reindex(self, directory_layout): -- cgit v1.2.3-70-g09d2 From 235a045d080dfe06e1738a23c1907f4221b66733 Mon Sep 17 00:00:00 2001 From: Todd Gamblin Date: Tue, 16 Aug 2016 13:13:59 -0700 Subject: Add option to copy only certain deptypes to Spec.copy() - can now pass these to Spec.copy() and Spec._dup(): - deps=True - deps=False - deps=(list of deptypes) - Makes it easy to filter out only part of a spec. --- lib/spack/spack/spec.py | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/spack/spack/spec.py b/lib/spack/spack/spec.py index bc7250eec5..99446d21dd 100644 --- a/lib/spack/spack/spec.py +++ b/lib/spack/spack/spec.py @@ -1871,7 +1871,7 @@ class Spec(object): """Return list of any virtual deps in this spec.""" return [spec for spec in self.traverse() if spec.virtual] - def _dup(self, other, **kwargs): + def _dup(self, other, deps=True, cleardeps=True): """Copy the spec other into self. This is an overwriting copy. It does not copy any dependents (parents), but by default copies dependencies. @@ -1902,7 +1902,7 @@ class Spec(object): self.versions = other.versions.copy() self.architecture = other.architecture self.compiler = other.compiler.copy() if other.compiler else None - if kwargs.get('cleardeps', True): + if cleardeps: self._dependents = DependencyMap() self._dependencies = DependencyMap() self.compiler_flags = other.compiler_flags.copy() @@ -1912,19 +1912,15 @@ class Spec(object): self.external_module = other.external_module self.namespace = other.namespace self._hash = other._hash + self._cmp_key_cache = other._cmp_key_cache # If we copy dependencies, preserve DAG structure in the new spec - if kwargs.get('deps', True): + if deps: # This copies the deps from other using _dup(deps=False) - # XXX(deptype): We can keep different instances of specs here iff - # it is only a 'build' dependency (from its parent). - # All other instances must be shared (due to symbol - # and PATH contention). These should probably search - # for any existing installation which can satisfy the - # build and latch onto that because if 3 things need - # the same build dependency and it is *not* - # available, we only want to build it once. - new_nodes = other.flat_dependencies(deptype_query=alldeps) + deptypes = alldeps + if isinstance(deps, (tuple, list)): + deptypes = deps + new_nodes = other.flat_dependencies(deptypes=deptypes) new_nodes[self.name] = self stack = [other] @@ -1933,6 +1929,9 @@ class Spec(object): new_spec = new_nodes[cur_spec.name] for depspec in cur_spec._dependencies.values(): + if not any(d in deptypes for d in depspec.deptypes): + continue + stack.append(depspec.spec) # XXX(deptype): add any new deptypes that may have appeared @@ -1948,13 +1947,22 @@ class Spec(object): self.external_module = other.external_module return changed - def copy(self, **kwargs): + def copy(self, deps=True): """Return a copy of this spec. - By default, returns a deep copy. Supply dependencies=False - to get a shallow copy. + + By default, returns a deep copy. To control how dependencies are + copied, supply: + + deps=True: deep copy + + deps=False: shallow copy (no dependencies) + + deps=('link', 'build'): + only build and link dependencies. Similar for other deptypes. + """ clone = Spec.__new__(Spec) - clone._dup(self, **kwargs) + clone._dup(self, deps=deps) return clone @property -- cgit v1.2.3-70-g09d2 From bee5c055682cfcbac294c19890cdcdd0b20cc03d Mon Sep 17 00:00:00 2001 From: Todd Gamblin Date: Tue, 16 Aug 2016 13:14:58 -0700 Subject: Update tests to reflect new in-memory hashing vs. coarser dag_hash. - Spack currently not hashing build deps (to allow more reuse of packages and less frequent re-installing) - Fast in-memory hash should still hash *all* deptypes, and installed specs will only reflect link and run deps. - We'll revert this when we can concretize more liberally based on what is already installed. --- lib/spack/spack/test/directory_layout.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/spack/spack/test/directory_layout.py b/lib/spack/spack/test/directory_layout.py index 2d0565acae..fdaa43464b 100644 --- a/lib/spack/spack/test/directory_layout.py +++ b/lib/spack/spack/test/directory_layout.py @@ -91,22 +91,33 @@ class DirectoryLayoutTest(MockPackagesTest): # Make sure spec file can be read back in to get the original spec spec_from_file = self.layout.read_spec(spec_path) - self.assertEqual(spec, spec_from_file) - self.assertTrue(spec.eq_dag, spec_from_file) + + # currently we don't store build dependency information when + # we write out specs to the filesystem. + + # TODO: fix this when we can concretize more loosely based on + # TODO: what is installed. We currently omit these to + # TODO: increase reuse of build dependencies. + stored_deptypes = ('link', 'run') + expected = spec.copy(deps=stored_deptypes) + self.assertEqual(expected, spec_from_file) + self.assertTrue(expected.eq_dag, spec_from_file) self.assertTrue(spec_from_file.concrete) # Ensure that specs that come out "normal" are really normal. with open(spec_path) as spec_file: read_separately = Spec.from_yaml(spec_file.read()) - read_separately.normalize() - self.assertEqual(read_separately, spec_from_file) + # TODO: revise this when build deps are in dag_hash + norm = read_separately.normalized().copy(deps=stored_deptypes) + self.assertEqual(norm, spec_from_file) - read_separately.concretize() - self.assertEqual(read_separately, spec_from_file) + # TODO: revise this when build deps are in dag_hash + conc = read_separately.concretized().copy(deps=stored_deptypes) + self.assertEqual(conc, spec_from_file) # Make sure the hash of the read-in spec is the same - self.assertEqual(spec.dag_hash(), spec_from_file.dag_hash()) + self.assertEqual(expected.dag_hash(), spec_from_file.dag_hash()) # Ensure directories are properly removed self.layout.remove_install_directory(spec) -- cgit v1.2.3-70-g09d2 From 69b68153a173f6a6f273755c9f56da452cb5970e Mon Sep 17 00:00:00 2001 From: Todd Gamblin Date: Thu, 1 Sep 2016 11:13:26 -0700 Subject: Fix `spack reindex` so that it will work if DB is corrupt (duh). - Transaction logic had gotten complicated -- DB would not reindex when corrupt, rather the error would be reported (ugh). - DB will now print the error and force a rebuild when errors are detected reading the old databse. --- lib/spack/spack/database.py | 61 +++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/spack/spack/database.py b/lib/spack/spack/database.py index ba29b8da30..043e3b953d 100644 --- a/lib/spack/spack/database.py +++ b/lib/spack/spack/database.py @@ -164,6 +164,9 @@ class Database(object): self.lock = Lock(self._lock_path) self._data = {} + # whether there was an error at the start of a read transaction + self._error = None + def write_transaction(self, timeout=_db_lock_timeout): """Get a write lock context manager for use in a `with` block.""" return WriteTransaction(self.lock, self._read, self._write, timeout) @@ -256,7 +259,8 @@ class Database(object): def check(cond, msg): if not cond: - raise CorruptDatabaseError(self._index_path, msg) + raise CorruptDatabaseError( + "Spack database is corrupt: %s" % msg, self._index_path) check('database' in yfile, "No 'database' attribute in YAML.") @@ -275,6 +279,12 @@ class Database(object): self.reindex(spack.install_layout) installs = dict((k, v.to_dict()) for k, v in self._data.items()) + def invalid_record(hash_key, error): + msg = ("Invalid record in Spack database: " + "hash: %s, cause: %s: %s") + msg %= (hash_key, type(e).__name__, str(e)) + raise CorruptDatabaseError(msg, self._index_path) + # Build up the database in three passes: # # 1. Read in all specs without dependencies. @@ -298,15 +308,14 @@ class Database(object): data[hash_key] = InstallRecord.from_dict(spec, rec) except Exception as e: - tty.warn("Invalid database reecord:", - "file: %s" % self._index_path, - "hash: %s" % hash_key, - "cause: %s: %s" % (type(e).__name__, str(e))) - raise + invalid_record(hash_key, e) # Pass 2: Assign dependencies once all specs are created. for hash_key in data: - self._assign_dependencies(hash_key, installs, data) + try: + self._assign_dependencies(hash_key, installs, data) + except Exception as e: + invalid_record(hash_key, e) # Pass 3: Mark all specs concrete. Specs representing real # installations must be explicitly marked. @@ -324,7 +333,26 @@ class Database(object): Locks the DB if it isn't locked already. """ - with self.write_transaction(): + # Special transaction to avoid recursive reindex calls and to + # ignore errors if we need to rebuild a corrupt database. + def _read_suppress_error(): + try: + if os.path.isfile(self._index_path): + self._read_from_yaml(self._index_path) + except CorruptDatabaseError as e: + self._error = e + self._data = {} + + transaction = WriteTransaction( + self.lock, _read_suppress_error, self._write, _db_lock_timeout) + + with transaction: + if self._error: + tty.warn( + "Spack database was corrupt. Will rebuild. Error was:", + str(self._error)) + self._error = None + old_data = self._data try: self._data = {} @@ -333,10 +361,15 @@ class Database(object): for spec in directory_layout.all_specs(): # Create a spec for each known package and add it. path = directory_layout.path_for_spec(spec) - old_info = old_data.get(spec.dag_hash()) + + # Try to recover explicit value from old DB, but + # default it to False if DB was corrupt. explicit = False - if old_info is not None: - explicit = old_info.explicit + if old_data is not None: + old_info = old_data.get(spec.dag_hash()) + if old_info is not None: + explicit = old_info.explicit + self._add(spec, path, directory_layout, explicit=explicit) self._check_ref_counts() @@ -621,11 +654,7 @@ class Database(object): class CorruptDatabaseError(SpackError): - - def __init__(self, path, msg=''): - super(CorruptDatabaseError, self).__init__( - "Spack database is corrupt: %s. %s." % (path, msg), - "Try running `spack reindex` to fix.") + """Raised when errors are found while reading the database.""" class InvalidDatabaseVersionError(SpackError): -- cgit v1.2.3-70-g09d2