From 41b8f31bcd299d4ea7dd40138f725f77576278a3 Mon Sep 17 00:00:00 2001 From: Todd Gamblin Date: Mon, 5 Dec 2016 10:03:58 -0800 Subject: Use JSON for the database instead of YAML. (#2189) * Use JSON for the database instead of YAML. - JSON is much faster than YAML *and* can preserve ordered keys. - 170x+ faster than Python YAML when using unordered dicts - 55x faster than Python YAML (both using OrderedDicts) - 8x faster than C YAML (with OrderedDicts) - JSON is built into Python, unlike C YAML, so doesn't add a dependency. - Don't need human readability for the package database. - JSON requires no major changes to the code -- same object model as YAML. - add to_json, from_json methods to spec. * Add tests to ensure JSON and YAML don't need to be ordered in DB. * Write index.json first time it's not found instead of requiring reindex. * flake8 bug. --- bin/spack | 8 --- lib/spack/spack/database.py | 77 +++++++++++++++-------- lib/spack/spack/spec.py | 60 +++++++++++------- lib/spack/spack/test/database.py | 2 +- lib/spack/spack/test/spec_dag.py | 28 --------- lib/spack/spack/test/spec_yaml.py | 124 +++++++++++++++++++++++++++++++++++++ lib/spack/spack/util/spack_json.py | 56 +++++++++++++++++ lib/spack/spack/util/spack_yaml.py | 17 +++-- 8 files changed, 282 insertions(+), 90 deletions(-) create mode 100644 lib/spack/spack/util/spack_json.py diff --git a/bin/spack b/bin/spack index 7cc1b28a37..454a9a5b2d 100755 --- a/bin/spack +++ b/bin/spack @@ -41,14 +41,6 @@ SPACK_PREFIX = os.path.dirname(os.path.dirname(SPACK_FILE)) SPACK_LIB_PATH = os.path.join(SPACK_PREFIX, "lib", "spack") sys.path.insert(0, SPACK_LIB_PATH) -# Try to use system YAML if it is available, as it might have libyaml -# support (for faster loading via C). Load it before anything in -# lib/spack/external so it will take precedence over Spack's PyYAML. -try: - import yaml -except ImportError: - pass # ignore and use slow yaml - # Add external libs SPACK_EXTERNAL_LIBS = os.path.join(SPACK_LIB_PATH, "external") sys.path.insert(0, SPACK_EXTERNAL_LIBS) diff --git a/lib/spack/spack/database.py b/lib/spack/spack/database.py index e8902ec024..a01795ca0f 100644 --- a/lib/spack/spack/database.py +++ b/lib/spack/spack/database.py @@ -55,6 +55,7 @@ from spack.version import Version import spack.spec from spack.error import SpackError import spack.util.spack_yaml as syaml +import spack.util.spack_json as sjson # DB goes in this directory underneath the root @@ -134,10 +135,12 @@ class Database(object): under ``root/.spack-db``, which is created if it does not exist. This is the ``db_dir``. - The Database will attempt to read an ``index.yaml`` file in - ``db_dir``. If it does not find one, it will be created when - needed by scanning the entire Database root for ``spec.yaml`` - files according to Spack's ``DirectoryLayout``. + The Database will attempt to read an ``index.json`` file in + ``db_dir``. If it does not find one, it will fall back to read + an ``index.yaml`` if one is present. If that does not exist, it + will create a database when needed by scanning the entire + Database root for ``spec.yaml`` files according to Spack's + ``DirectoryLayout``. Caller may optionally provide a custom ``db_dir`` parameter where data will be stored. This is intended to be used for @@ -154,7 +157,8 @@ class Database(object): self._db_dir = db_dir # Set up layout of database files within the db dir - self._index_path = join_path(self._db_dir, 'index.yaml') + self._old_yaml_index_path = join_path(self._db_dir, 'index.yaml') + self._index_path = join_path(self._db_dir, 'index.json') self._lock_path = join_path(self._db_dir, 'lock') # This is for other classes to use to lock prefix directories. @@ -179,8 +183,8 @@ class Database(object): """Get a read lock context manager for use in a `with` block.""" return ReadTransaction(self.lock, self._read, timeout=timeout) - def _write_to_yaml(self, stream): - """Write out the databsae to a YAML file. + def _write_to_file(self, stream): + """Write out the databsae to a JSON file. This function does not do any locking or transactions. """ @@ -201,12 +205,12 @@ class Database(object): } try: - return syaml.dump( - database, stream=stream, default_flow_style=False) + sjson.dump(database, stream) except YAMLError as e: - raise SpackYAMLError("error writing YAML database:", str(e)) + raise syaml.SpackYAMLError( + "error writing YAML database:", str(e)) - def _read_spec_from_yaml(self, hash_key, installs): + def _read_spec_from_dict(self, hash_key, installs): """Recursively construct a spec from a hash in a YAML database. Does not do any locking. @@ -241,24 +245,32 @@ class Database(object): child = data[dhash].spec spec._add_dependency(child, dtypes) - def _read_from_yaml(self, stream): + def _read_from_file(self, stream, format='json'): """ - Fill database from YAML, do not maintain old data + Fill database from file, do not maintain old data Translate the spec portions from node-dict form to spec form Does not do any locking. """ + if format.lower() == 'json': + load = sjson.load + elif format.lower() == 'yaml': + load = syaml.load + else: + raise ValueError("Invalid database format: %s" % format) + try: if isinstance(stream, basestring): with open(stream, 'r') as f: - yfile = syaml.load(f) + fdata = load(f) else: - yfile = syaml.load(stream) - + fdata = load(stream) except MarkedYAMLError as e: - raise SpackYAMLError("error parsing YAML database:", str(e)) + raise syaml.SpackYAMLError("error parsing YAML database:", str(e)) + except Exception as e: + raise CorruptDatabaseError("error parsing database:", str(e)) - if yfile is None: + if fdata is None: return def check(cond, msg): @@ -266,10 +278,10 @@ class Database(object): raise CorruptDatabaseError( "Spack database is corrupt: %s" % msg, self._index_path) - check('database' in yfile, "No 'database' attribute in YAML.") + check('database' in fdata, "No 'database' attribute in YAML.") # High-level file checks - db = yfile['database'] + db = fdata['database'] check('installs' in db, "No 'installs' in YAML DB.") check('version' in db, "No 'version' in YAML DB.") @@ -303,7 +315,7 @@ class Database(object): for hash_key, rec in installs.items(): try: # This constructs a spec DAG from the list of all installs - spec = self._read_spec_from_yaml(hash_key, installs) + spec = self._read_spec_from_dict(hash_key, installs) # Insert the brand new spec in the database. Each # spec has its own copies of its dependency specs. @@ -342,7 +354,7 @@ class Database(object): def _read_suppress_error(): try: if os.path.isfile(self._index_path): - self._read_from_yaml(self._index_path) + self._read_from_file(self._index_path) except CorruptDatabaseError as e: self._error = e self._data = {} @@ -426,7 +438,7 @@ class Database(object): # Write a temporary database file them move it into place try: with open(temp_file, 'w') as f: - self._write_to_yaml(f) + self._write_to_file(f) os.rename(temp_file, self._index_path) except: # Clean up temp file if something goes wrong. @@ -437,11 +449,24 @@ class Database(object): def _read(self): """Re-read Database from the data in the set location. - This does no locking. + This does no locking, with one exception: it will automatically + migrate an index.yaml to an index.json if possible. This requires + taking a write lock. + """ if os.path.isfile(self._index_path): - # Read from YAML file if a database exists - self._read_from_yaml(self._index_path) + # Read from JSON file if a JSON database exists + self._read_from_file(self._index_path, format='json') + + elif os.path.isfile(self._old_yaml_index_path): + if os.access(self._db_dir, os.R_OK | os.W_OK): + # if we can write, then read AND write a JSON file. + self._read_from_file(self._old_yaml_index_path, format='yaml') + with WriteTransaction(self.lock, timeout=_db_lock_timeout): + self._write(None, None, None) + else: + # Read chck for a YAML file if we can't find JSON. + self._read_from_file(self._old_yaml_index_path, format='yaml') else: # The file doesn't exist, try to traverse the directory. diff --git a/lib/spack/spack/spec.py b/lib/spack/spack/spec.py index 86eaa3bb91..2ef3757689 100644 --- a/lib/spack/spack/spec.py +++ b/lib/spack/spack/spec.py @@ -117,6 +117,7 @@ from spack.build_environment import get_path_from_module, load_module from spack.util.prefix import Prefix from spack.util.string import * import spack.util.spack_yaml as syaml +import spack.util.spack_json as sjson from spack.util.spack_yaml import syaml_dict from spack.util.crypto import prefix_bits from spack.version import * @@ -153,7 +154,6 @@ __all__ = [ 'UnsatisfiableArchitectureSpecError', 'UnsatisfiableProviderSpecError', 'UnsatisfiableDependencySpecError', - 'SpackYAMLError', 'AmbiguousHashError'] # Valid pattern for an identifier in Spack @@ -1174,15 +1174,21 @@ class Spec(object): return syaml_dict([(self.name, d)]) - def to_yaml(self, stream=None): + def to_dict(self): node_list = [] for s in self.traverse(order='pre', deptype=('link', 'run')): node = s.to_node_dict() node[s.name]['hash'] = s.dag_hash() node_list.append(node) + + return syaml_dict([('spec', node_list)]) + + def to_yaml(self, stream=None): return syaml.dump( - syaml_dict([('spec', node_list)]), - stream=stream, default_flow_style=False) + self.to_dict(), stream=stream, default_flow_style=False) + + def to_json(self, stream=None): + return sjson.dump(self.to_dict(), stream) @staticmethod def from_node_dict(node): @@ -1245,22 +1251,13 @@ class Spec(object): yield dep_name, dag_hash, list(deptypes) @staticmethod - def from_yaml(stream): + def from_dict(data): """Construct a spec from YAML. Parameters: - stream -- string or file object to read from. - - TODO: currently discards hashes. Include hashes when they - represent more than the DAG does. - + data -- a nested dict/list data structure read from YAML or JSON. """ - try: - yfile = syaml.load(stream) - except MarkedYAMLError as e: - raise SpackYAMLError("error parsing YAML spec:", str(e)) - - nodes = yfile['spec'] + nodes = data['spec'] # Read nodes out of list. Root spec is the first element; # dependencies are the following elements. @@ -1285,6 +1282,32 @@ class Spec(object): return spec + @staticmethod + def from_yaml(stream): + """Construct a spec from YAML. + + Parameters: + stream -- string or file object to read from. + """ + try: + data = syaml.load(stream) + return Spec.from_dict(data) + except MarkedYAMLError as e: + raise syaml.SpackYAMLError("error parsing YAML spec:", str(e)) + + @staticmethod + def from_json(stream): + """Construct a spec from JSON. + + Parameters: + stream -- string or file object to read from. + """ + try: + data = sjson.load(stream) + return Spec.from_dict(data) + except Exception as e: + raise sjson.SpackJSONError("error parsing JSON spec:", str(e)) + def _concretize_helper(self, presets=None, visited=None): """Recursive helper function for concretize(). This concretizes everything bottom-up. As things are @@ -3064,11 +3087,6 @@ class UnsatisfiableDependencySpecError(UnsatisfiableSpecError): provided, required, "dependency") -class SpackYAMLError(spack.error.SpackError): - def __init__(self, msg, yaml_error): - super(SpackYAMLError, self).__init__(msg, str(yaml_error)) - - class AmbiguousHashError(SpecError): def __init__(self, msg, *specs): super(AmbiguousHashError, self).__init__(msg) diff --git a/lib/spack/spack/test/database.py b/lib/spack/spack/test/database.py index 1a1281e10e..55a1d84e20 100644 --- a/lib/spack/spack/test/database.py +++ b/lib/spack/spack/test/database.py @@ -75,7 +75,7 @@ class DatabaseTest(MockDatabase): def test_005_db_exists(self): """Make sure db cache file exists after creating.""" - index_file = join_path(self.install_path, '.spack-db', 'index.yaml') + index_file = join_path(self.install_path, '.spack-db', 'index.json') lock_file = join_path(self.install_path, '.spack-db', 'lock') self.assertTrue(os.path.exists(index_file)) diff --git a/lib/spack/spack/test/spec_dag.py b/lib/spack/spack/test/spec_dag.py index 0bc63bcf0f..dd536f945c 100644 --- a/lib/spack/spack/test/spec_dag.py +++ b/lib/spack/spack/test/spec_dag.py @@ -496,34 +496,6 @@ class SpecDagTest(MockPackagesTest): traversal = dag.traverse(deptype='run') self.assertEqual([x.name for x in traversal], names) - def test_using_ordered_dict(self): - """ Checks that dicts are ordered - - Necessary to make sure that dag_hash is stable across python - versions and processes. - """ - def descend_and_check(iterable, level=0): - from spack.util.spack_yaml import syaml_dict - from collections import Iterable, Mapping - if isinstance(iterable, Mapping): - self.assertTrue(isinstance(iterable, syaml_dict)) - return descend_and_check(iterable.values(), level=level + 1) - max_level = level - for value in iterable: - if isinstance(value, Iterable) and not isinstance(value, str): - nlevel = descend_and_check(value, level=level + 1) - if nlevel > max_level: - max_level = nlevel - return max_level - - specs = ['mpileaks ^zmpi', 'dttop', 'dtuse'] - for spec in specs: - dag = Spec(spec) - dag.normalize() - level = descend_and_check(dag.to_node_dict()) - # level just makes sure we are doing something here - self.assertTrue(level >= 5) - def test_hash_bits(self): """Ensure getting first n bits of a base32-encoded DAG hash works.""" diff --git a/lib/spack/spack/test/spec_yaml.py b/lib/spack/spack/test/spec_yaml.py index 964aea9422..442c6e6e81 100644 --- a/lib/spack/spack/test/spec_yaml.py +++ b/lib/spack/spack/test/spec_yaml.py @@ -27,6 +27,10 @@ YAML format preserves DAG informatoin in the spec. """ +import spack.util.spack_yaml as syaml +import spack.util.spack_json as sjson +from spack.util.spack_yaml import syaml_dict + from spack.spec import Spec from spack.test.mock_packages_test import * @@ -64,3 +68,123 @@ class SpecYamlTest(MockPackagesTest): for dep in ('callpath', 'mpich', 'dyninst', 'libdwarf', 'libelf'): self.assertTrue(spec[dep].eq_dag(yaml_spec[dep])) + + def test_using_ordered_dict(self): + """ Checks that dicts are ordered + + Necessary to make sure that dag_hash is stable across python + versions and processes. + """ + def descend_and_check(iterable, level=0): + from spack.util.spack_yaml import syaml_dict + from collections import Iterable, Mapping + if isinstance(iterable, Mapping): + self.assertTrue(isinstance(iterable, syaml_dict)) + return descend_and_check(iterable.values(), level=level + 1) + max_level = level + for value in iterable: + if isinstance(value, Iterable) and not isinstance(value, str): + nlevel = descend_and_check(value, level=level + 1) + if nlevel > max_level: + max_level = nlevel + return max_level + + specs = ['mpileaks ^zmpi', 'dttop', 'dtuse'] + for spec in specs: + dag = Spec(spec) + dag.normalize() + level = descend_and_check(dag.to_node_dict()) + # level just makes sure we are doing something here + self.assertTrue(level >= 5) + + def test_ordered_read_not_required_for_consistent_dag_hash(self): + """Make sure ordered serialization isn't required to preserve hashes. + + For consistent hashes, we require that YAML and json documents + have their keys serialized in a deterministic order. However, we + don't want to require them to be serialized in order. This + ensures that is not reauired. + + """ + specs = ['mpileaks ^zmpi', 'dttop', 'dtuse'] + for spec in specs: + spec = Spec(spec) + spec.concretize() + + # + # Dict & corresponding YAML & JSON from the original spec. + # + spec_dict = spec.to_dict() + spec_yaml = spec.to_yaml() + spec_json = spec.to_json() + + # + # Make a spec with reversed OrderedDicts for every + # OrderedDict in the original. + # + reversed_spec_dict = reverse_all_dicts(spec.to_dict()) + + # + # Dump to YAML and JSON + # + yaml_string = syaml.dump(spec_dict, default_flow_style=False) + reversed_yaml_string = syaml.dump(reversed_spec_dict, + default_flow_style=False) + json_string = sjson.dump(spec_dict) + reversed_json_string = sjson.dump(reversed_spec_dict) + + # + # Do many consistency checks + # + + # spec yaml is ordered like the spec dict + self.assertEqual(yaml_string, spec_yaml) + self.assertEqual(json_string, spec_json) + + # reversed string is different from the original, so it + # *would* generate a different hash + self.assertNotEqual(yaml_string, reversed_yaml_string) + self.assertNotEqual(json_string, reversed_json_string) + + # build specs from the "wrongly" ordered data + round_trip_yaml_spec = Spec.from_yaml(yaml_string) + round_trip_json_spec = Spec.from_json(json_string) + round_trip_reversed_yaml_spec = Spec.from_yaml( + reversed_yaml_string) + round_trip_reversed_json_spec = Spec.from_yaml( + reversed_json_string) + + # TODO: remove this when build deps are in provenance. + spec = spec.copy(deps=('link', 'run')) + + # specs are equal to the original + self.assertEqual(spec, round_trip_yaml_spec) + self.assertEqual(spec, round_trip_json_spec) + self.assertEqual(spec, round_trip_reversed_yaml_spec) + self.assertEqual(spec, round_trip_reversed_json_spec) + self.assertEqual(round_trip_yaml_spec, + round_trip_reversed_yaml_spec) + self.assertEqual(round_trip_json_spec, + round_trip_reversed_json_spec) + + # dag_hashes are equal + self.assertEqual( + spec.dag_hash(), round_trip_yaml_spec.dag_hash()) + self.assertEqual( + spec.dag_hash(), round_trip_json_spec.dag_hash()) + self.assertEqual( + spec.dag_hash(), round_trip_reversed_yaml_spec.dag_hash()) + self.assertEqual( + spec.dag_hash(), round_trip_reversed_json_spec.dag_hash()) + + +def reverse_all_dicts(data): + """Descend into data and reverse all the dictionaries""" + if isinstance(data, dict): + return syaml_dict(reversed( + [(reverse_all_dicts(k), reverse_all_dicts(v)) + for k, v in data.items()])) + elif isinstance(data, (list, tuple)): + return type(data)(reverse_all_dicts(elt) for elt in data) + else: + return data diff --git a/lib/spack/spack/util/spack_json.py b/lib/spack/spack/util/spack_json.py new file mode 100644 index 0000000000..240ce86c68 --- /dev/null +++ b/lib/spack/spack/util/spack_json.py @@ -0,0 +1,56 @@ +############################################################################## +# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory. +# +# This file is part of Spack. +# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved. +# LLNL-CODE-647188 +# +# For details, see https://github.com/llnl/spack +# Please also see the LICENSE file for our notice and the LGPL. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License (as +# published by the Free Software Foundation) version 2.1, February 1999. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and +# conditions of the GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +############################################################################## +"""Simple wrapper around JSON to guarantee consistent use of load/dump. """ +import json +import spack.error + +__all__ = ['load', 'dump', 'SpackJSONError'] + +_json_dump_args = { + 'indent': True, + 'separators': (',', ': ') +} + + +def load(stream): + """Spack JSON needs to be ordered to support specs.""" + if isinstance(stream, basestring): + return json.loads(stream) + else: + return json.load(stream) + + +def dump(data, stream=None): + """Dump JSON with a reasonable amount of indentation and separation.""" + if stream is None: + return json.dumps(data, **_json_dump_args) + else: + return json.dump(data, stream, **_json_dump_args) + + +class SpackJSONError(spack.error.SpackError): + """Raised when there are issues with JSON parsing.""" + def __init__(self, msg, yaml_error): + super(SpackJSONError, self).__init__(msg, str(yaml_error)) diff --git a/lib/spack/spack/util/spack_yaml.py b/lib/spack/spack/util/spack_yaml.py index c27db52066..9d4c607908 100644 --- a/lib/spack/spack/util/spack_yaml.py +++ b/lib/spack/spack/util/spack_yaml.py @@ -32,23 +32,21 @@ """ import yaml -try: - from yaml import CLoader as Loader, CDumper as Dumper -except ImportError as e: - from yaml import Loader, Dumper +from yaml import Loader, Dumper from yaml.nodes import * from yaml.constructor import ConstructorError from ordereddict_backport import OrderedDict +import spack.error + # Only export load and dump -__all__ = ['load', 'dump'] +__all__ = ['load', 'dump', 'SpackYAMLError'] # Make new classes so we can add custom attributes. # Also, use OrderedDict instead of just dict. class syaml_dict(OrderedDict): - def __repr__(self): mappings = ('%r: %r' % (k, v) for k, v in self.items()) return '{%s}' % ', '.join(mappings) @@ -153,6 +151,7 @@ class OrderedLineLoader(Loader): mark(mapping, node) return mapping + # register above new constructors OrderedLineLoader.add_constructor( u'tag:yaml.org,2002:map', OrderedLineLoader.construct_yaml_map) @@ -223,3 +222,9 @@ def load(*args, **kwargs): def dump(*args, **kwargs): kwargs['Dumper'] = OrderedLineDumper return yaml.dump(*args, **kwargs) + + +class SpackYAMLError(spack.error.SpackError): + """Raised when there are issues with YAML parsing.""" + def __init__(self, msg, yaml_error): + super(SpackYAMLError, self).__init__(msg, str(yaml_error)) -- cgit v1.2.3-70-g09d2