summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Harmen Stoppels <me@harmenstoppels.nl> 2024-07-02 14:00:19 +0200
committer GitHub <noreply@github.com> 2024-07-02 14:00:19 +0200
commit 2e8b4e660ee23f3ddea490a681f3042a5ae55a57 (patch)
tree f34636bd6de6101db175fe89c789fed1d3d0f9c0
parent 0ca1ee8b91eec07a35b9b9f626945079a2d00bd5 (diff)
download spack-2e8b4e660ee23f3ddea490a681f3042a5ae55a57.tar.gz
spack-2e8b4e660ee23f3ddea490a681f3042a5ae55a57.tar.bz2
spack-2e8b4e660ee23f3ddea490a681f3042a5ae55a57.tar.xz
spack-2e8b4e660ee23f3ddea490a681f3042a5ae55a57.zip
spack_yaml: add anchorify function (#44995)
This adds spack.util.spack_yaml.anchorify, which takes a non-cyclic dict/list structure and replaces identical values with (back) references to the first instance, so that YAML serialization will use anchors. `repr` is used to identify sub-DAGs, which in principle has quadratic complexity in the depth of the graph, but in practice the depth is O(1), so this should not matter. This is then used in CI to reduce the size of generated YAML files to 30% of their original size.
-rw-r--r-- lib/spack/spack/ci.py 9
-rw-r--r-- lib/spack/spack/test/spec_yaml.py 49
-rw-r--r-- lib/spack/spack/util/spack_yaml.py 25
3 files changed, 80 insertions, 3 deletions
diff --git a/lib/spack/spack/ci.py b/lib/spack/spack/ci.py
index bd664c664d..7237c95e69 100644
--- a/lib/spack/spack/ci.py
+++ b/lib/spack/spack/ci.py
@@ -22,6 +22,8 @@ from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import HTTPHandler, Request, build_opener
+import ruamel.yaml
+
import llnl.util.filesystem as fs
import llnl.util.tty as tty
from llnl.util.lang import memoized
@@ -1310,8 +1312,11 @@ def generate_gitlab_ci_yaml(
if not rebuild_everything:
sys.exit(1)
- with open(output_file, "w") as outf:
- outf.write(syaml.dump(sorted_output, default_flow_style=True))
+ # Minimize yaml output size through use of anchors
+ syaml.anchorify(sorted_output)
+
+ with open(output_file, "w") as f:
+ ruamel.yaml.YAML().dump(sorted_output, f)
def _url_encode_string(input_string):
diff --git a/lib/spack/spack/test/spec_yaml.py b/lib/spack/spack/test/spec_yaml.py
index 5dd854628a..df63fdf72e 100644
--- a/lib/spack/spack/test/spec_yaml.py
+++ b/lib/spack/spack/test/spec_yaml.py
@@ -13,10 +13,12 @@ import collections
import collections.abc
import gzip
import inspect
+import io
import json
import os
import pytest
+import ruamel.yaml
import spack.hash_types as ht
import spack.paths
@@ -505,3 +507,50 @@ def test_load_json_specfiles(specfile, expected_hash, reader_cls):
# JSON or YAML file, not a list
for edge in s2.traverse_edges():
assert isinstance(edge.virtuals, tuple), edge
+
+
+def test_anchorify_1():
+ """Test that anchorify replaces duplicate values with references to a single instance, and
+ that that results in anchors in the output YAML."""
+ before = {"a": [1, 2, 3], "b": [1, 2, 3]}
+ after = {"a": [1, 2, 3], "b": [1, 2, 3]}
+ syaml.anchorify(after)
+ assert before == after
+ assert after["a"] is after["b"]
+
+ # Check if anchors are used
+ out = io.StringIO()
+ ruamel.yaml.YAML().dump(after, out)
+ assert (
+ out.getvalue()
+ == """\
+a: &id001
+- 1
+- 2
+- 3
+b: *id001
+"""
+ )
+
+
+def test_anchorify_2():
+ before = {"a": {"b": {"c": True}}, "d": {"b": {"c": True}}, "e": {"c": True}}
+ after = {"a": {"b": {"c": True}}, "d": {"b": {"c": True}}, "e": {"c": True}}
+ syaml.anchorify(after)
+ assert before == after
+ assert after["a"] is after["d"]
+ assert after["a"]["b"] is after["e"]
+
+ # Check if anchors are used
+ out = io.StringIO()
+ ruamel.yaml.YAML().dump(after, out)
+ assert (
+ out.getvalue()
+ == """\
+a: &id001
+ b: &id002
+ c: true
+d: *id001
+e: *id002
+"""
+ )
diff --git a/lib/spack/spack/util/spack_yaml.py b/lib/spack/spack/util/spack_yaml.py
index 9dadc25d82..200025411e 100644
--- a/lib/spack/spack/util/spack_yaml.py
+++ b/lib/spack/spack/util/spack_yaml.py
@@ -20,7 +20,7 @@ import enum
import functools
import io
import re
-from typing import IO, List, Optional
+from typing import IO, Any, Callable, Dict, List, Optional, Union
import ruamel.yaml
from ruamel.yaml import comments, constructor, emitter, error, representer
@@ -493,6 +493,29 @@ def name_mark(name):
return error.StringMark(name, None, None, None, None, None)
+def anchorify(data: Union[dict, list], identifier: Callable[[Any], str] = repr) -> None:
+ """Replace identical dict/list branches in tree with references to earlier instances. The YAML
+    serializer then generates anchors for them, resulting in smaller YAML files."""
+ anchors: Dict[str, Union[dict, list]] = {}
+ queue: List[Union[dict, list]] = [data]
+
+ while queue:
+ item = queue.pop()
+
+ for key, value in item.items() if isinstance(item, dict) else enumerate(item):
+ if not isinstance(value, (dict, list)):
+ continue
+
+ id = identifier(value)
+ anchor = anchors.get(id)
+
+ if anchor is None:
+ anchors[id] = value
+ queue.append(value)
+ else:
+ item[key] = anchor # replace with reference
+
+
class SpackYAMLError(spack.error.SpackError):
"""Raised when there are issues with YAML parsing."""