author Omar Padron <omar.padron@kitware.com> 2020-06-22 13:19:47 -0400
committer GitHub <noreply@github.com> 2020-06-22 13:19:47 -0400
commit 224dc951597253f426f16132eab43701a9f1c53e (patch)
tree 23c2dc5276cffb84f9e6a646ca17c1d706148699 /lib
parent 42f2c168d2374a041d9bbe7b2246a8cae56bb369 (diff)
Pre ci optimization (#16372)
* add initial optimization script
* integrate optimization in spack ci
* make optimization opt-in
* fix import error
* flake8 fixes
* update command completion
* work around vermin errors
* fix sphinx errors
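
The optimization is opt-in: generated pipelines are unchanged unless the new flag is passed, presumably along the lines of

    spack ci generate --optimize

from within an active Spack environment (the flag name comes from the diff below; the exact invocation is assumed here for illustration).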
Diffstat (limited to 'lib')
-rw-r--r-- lib/spack/spack/ci.py                8
-rw-r--r-- lib/spack/spack/ci_optimization.py   377
-rw-r--r-- lib/spack/spack/cmd/ci.py            9
3 files changed, 392 insertions, 2 deletions
diff --git a/lib/spack/spack/ci.py b/lib/spack/spack/ci.py
index 9ba7f39e45..ce74abf29c 100644
--- a/lib/spack/spack/ci.py
+++ b/lib/spack/spack/ci.py
@@ -449,7 +449,8 @@ def format_job_needs(phase_name, strip_compilers, dep_jobs,
def generate_gitlab_ci_yaml(env, print_summary, output_file,
- custom_spack_repo=None, custom_spack_ref=None):
+ custom_spack_repo=None, custom_spack_ref=None,
+ run_optimizer=False):
# FIXME: What's the difference between one that opens with 'spack'
# and one that opens with 'env'? This will only handle the former.
with spack.concretize.disable_compiler_existence_check():
@@ -788,6 +789,11 @@ def generate_gitlab_ci_yaml(env, print_summary, output_file,
for output_key, output_value in sorted(output_object.items()):
sorted_output[output_key] = output_value
+ # TODO(opadron): remove this or refactor
+ if run_optimizer:
+ import spack.ci_optimization as ci_opt
+ sorted_output = ci_opt.optimizer(sorted_output)
+
with open(output_file, 'w') as outf:
outf.write(syaml.dump_config(sorted_output, default_flow_style=True))
diff --git a/lib/spack/spack/ci_optimization.py b/lib/spack/spack/ci_optimization.py
new file mode 100644
index 0000000000..693802d06d
--- /dev/null
+++ b/lib/spack/spack/ci_optimization.py
@@ -0,0 +1,377 @@
+# Copyright 2013-2020 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+import collections
+
+try:
+ # dynamically import to keep vermin from complaining
+ collections_abc = __import__('collections.abc')
+except ImportError:
+ collections_abc = collections
+
+import copy
+import hashlib
+
+import spack.util.spack_yaml as syaml
+
+
+def matches(obj, proto):
+ """Returns True if the test object "obj" matches the prototype object
+ "proto".
+
+ If obj and proto are mappings, obj matches proto if (key in obj) and
+ (obj[key] matches proto[key]) for every key in proto.
+
+ If obj and proto are sequences, obj matches proto if they are of the same
+ length and (a matches b) for every (a,b) in zip(obj, proto).
+
+ Otherwise, obj matches proto if obj == proto.
+
+ Precondition: proto must not have any reference cycles
+ """
+ if isinstance(obj, collections_abc.Mapping):
+ if not isinstance(proto, collections_abc.Mapping):
+ return False
+
+ return all(
+ (key in obj and matches(obj[key], val))
+ for key, val in proto.items()
+ )
+
+ if (isinstance(obj, collections_abc.Sequence) and
+ not isinstance(obj, str)):
+
+ if not (isinstance(proto, collections_abc.Sequence) and
+ not isinstance(proto, str)):
+ return False
+
+ if len(obj) != len(proto):
+ return False
+
+ return all(
+ matches(obj[index], val)
+ for index, val in enumerate(proto)
+ )
+
+ return obj == proto
+
+
+def subkeys(obj, proto):
+ """Returns the test mapping "obj" after factoring out the items it has in
+ common with the prototype mapping "proto".
+
+ Consider a recursive merge operation, merge(a, b) on mappings a and b, that
+ returns a mapping, m, whose keys are the union of the keys of a and b, and
+ for every such key, "k", its corresponding value is:
+
+ - merge(a[key], b[key]) if a[key] and b[key] are mappings, or
+ - b[key] if (key in b) and not matches(a[key], b[key]), or
+ - a[key] otherwise
+
+
+ If obj and proto are mappings, the returned object is the smallest object,
+ "a", such that merge(a, proto) matches obj.
+
+ Otherwise, obj is returned.
+ """
+ if not (isinstance(obj, collections_abc.Mapping) and
+ isinstance(proto, collections_abc.Mapping)):
+ return obj
+
+ new_obj = {}
+ for key, value in obj.items():
+ if key not in proto:
+ new_obj[key] = value
+ continue
+
+ if (matches(value, proto[key]) and
+ matches(proto[key], value)):
+ continue
+
+ if isinstance(value, collections_abc.Mapping):
+ new_obj[key] = subkeys(value, proto[key])
+ continue
+
+ new_obj[key] = value
+
+ return new_obj
+
+
+def add_extends(yaml, key):
+ """Modifies the given object "yaml" so that it includes an "extends" key
+ whose value features "key".
+
+ If "extends" is not in yaml, then yaml is modified such that
+ yaml["extends"] == key.
+
+ If yaml["extends"] is a str, then yaml is modified such that
+ yaml["extends"] == [yaml["extends"], key]
+
+ If yaml["extends"] is a list that does not include key, then key is
+ appended to the list.
+
+ Otherwise, yaml is left unchanged.
+ """
+
+ has_key = ('extends' in yaml)
+ extends = yaml.get('extends')
+
+ if has_key and not isinstance(extends, (str, collections_abc.Sequence)):
+ return
+
+ if extends is None:
+ yaml['extends'] = key
+ return
+
+ if isinstance(extends, str):
+ if extends != key:
+ yaml['extends'] = [extends, key]
+ return
+
+ if key not in extends:
+ extends.append(key)
+
+
+def common_subobject(yaml, sub):
+ """Factor prototype object "sub" out of the values of mapping "yaml".
+
+ Consider a modified copy of yaml, "new", where for each key, "key" in yaml:
+
+ - If yaml[key] matches sub, then new[key] = subkeys(yaml[key], sub).
+ - Otherwise, new[key] = yaml[key].
+
+ If no key satisfies the above match criterion, then (yaml, None) is
+ returned and the yaml object is left unchanged.
+
+ Otherwise, each matching value in new is modified as in
+ add_extends(new[key], common_key), and then new[common_key] is set to sub.
+ The common_key value is chosen such that it does not match any preexisting
+ key in new. In this case, (new, common_key) is returned.
+ """
+ match_list = set(k for k, v in yaml.items() if matches(v, sub))
+
+ if not match_list:
+ return yaml, None
+
+ common_prefix = '.c'
+ common_index = 0
+
+ while True:
+ common_key = ''.join((common_prefix, str(common_index)))
+ if common_key not in yaml:
+ break
+ common_index += 1
+
+ new_yaml = {}
+
+ for key, val in yaml.items():
+ new_yaml[key] = copy.deepcopy(val)
+
+ if not matches(val, sub):
+ continue
+
+ new_yaml[key] = subkeys(new_yaml[key], sub)
+ add_extends(new_yaml[key], common_key)
+
+ new_yaml[common_key] = sub
+
+ return new_yaml, common_key
+
+
+def print_delta(name, old, new, applied=None):
+ delta = new - old
+ reldelta = (1000 * delta) // old
+ reldelta = (reldelta // 10, reldelta % 10)
+
+ if applied is None:
+ applied = (new <= old)
+
+ print('\n'.join((
+ '{} {}:',
+ ' before: {: 10d}',
+ ' after : {: 10d}',
+ ' delta : {:+10d} ({:=+3d}.{}%)',
+ )).format(
+ name,
+ ('+' if applied else 'x'),
+ old,
+ new,
+ delta,
+ reldelta[0],
+ reldelta[1]
+ ))
+
+
+def try_optimization_pass(name, yaml, optimization_pass, *args, **kwargs):
+ """Try applying an optimization pass and return information about the
+ result
+
+ "name" is a string describing the nature of the pass. If it is a non-empty
+ string, summary statistics are also printed to stdout.
+
+ "yaml" is the object to apply the pass to.
+
+ "optimization_pass" is the function implementing the pass to be applied.
+
+ "args" and "kwargs" are the additional arguments to pass to optimization
+ pass. The pass is applied as
+
+ >>> (new_yaml, *other_results) = optimization_pass(yaml, *args, **kwargs)
+
+ The pass's results are greedily rejected if it does not modify the original
+ yaml document, or if it produces a yaml document that serializes to a
+ larger string.
+
+ Returns (new_yaml, yaml, applied, other_results) if applied, or
+ (yaml, new_yaml, applied, other_results) otherwise.
+ """
+ result = optimization_pass(yaml, *args, **kwargs)
+ new_yaml, other_results = result[0], result[1:]
+
+ if new_yaml is yaml:
+ # pass was not applied
+ return (yaml, new_yaml, False, other_results)
+
+ pre_size = len(syaml.dump_config(yaml, default_flow_style=True))
+ post_size = len(syaml.dump_config(new_yaml, default_flow_style=True))
+
+ # keep the pass only if it does not increase the serialized size
+ applied = (post_size <= pre_size)
+ if applied:
+ yaml, new_yaml = new_yaml, yaml
+
+ if name:
+ print_delta(name, pre_size, post_size, applied)
+
+ return (yaml, new_yaml, applied, other_results)
+
+
+def build_histogram(iterator, key):
+ """Builds a histogram of values given an iterable of mappings and a key.
+
+ For each mapping "m" with key "key" in iterator, the value m[key] is
+ considered.
+
+ Returns a list of tuples (hash, count, proportion, value), where
+
+ - "hash" is a sha1sum hash of the value.
+ - "count" is the number of occurences of values that hash to "hash".
+ - "proportion" is the proportion of all values considered above that
+ hash to "hash".
+ - "value" is one of the values considered above that hash to "hash".
+ Which value is chosen when multiple values hash to the same "hash" is
+ undefined.
+
+ The list is sorted in descending order by count, yielding the most
+ frequently occurring hashes first.
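+
+ A small sketch with invented data (not from the Spack test suite):
+
+ >>> h = build_histogram([{'tags': ['x']}, {'tags': ['x']},
+ ...                      {'tags': ['y']}], 'tags')
+ >>> [(count, value) for _, count, _, value in h]
+ [(2, ['x']), (1, ['y'])]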
+ """
+ buckets = collections.defaultdict(int)
+ values = {}
+
+ num_objects = 0
+ for obj in iterator:
+ num_objects += 1
+
+ try:
+ val = obj[key]
+ except (KeyError, TypeError):
+ continue
+
+ value_hash = hashlib.sha1()
+ value_hash.update(syaml.dump_config(val).encode())
+ value_hash = value_hash.hexdigest()
+
+ buckets[value_hash] += 1
+ values[value_hash] = val
+
+ return [(h, buckets[h], float(buckets[h]) / num_objects, values[h])
+ for h in sorted(buckets.keys(), key=lambda k: -buckets[k])]
+
+
+def optimizer(yaml):
+ original_size = len(syaml.dump_config(yaml, default_flow_style=True))
+
+ # try factoring out commonly repeated portions
+ common_job = {
+ 'variables': {
+ 'SPACK_COMPILER_ACTION': 'NONE',
+ 'SPACK_RELATED_BUILDS_CDASH': ''
+ },
+
+ 'after_script': ['rm -rf "./spack"'],
+
+ 'artifacts': {
+ 'paths': ['jobs_scratch_dir', 'cdash_report'],
+ 'when': 'always'
+ },
+ }
+
+ # look for a list of tags that appear frequently
+ _, count, proportion, tags = next(iter(
+ build_histogram(yaml.values(), 'tags')),
+ (None,) * 4)
+
+ # If a list of tags is found, and more than one job uses it,
+ # *and* the jobs that do use it represent at least 70% of all jobs, then
+ # add the list to the prototype object.
+ if tags and count > 1 and proportion >= 0.70:
+ common_job['tags'] = tags
+
+ # apply common object factorization
+ yaml, other, applied, rest = try_optimization_pass(
+ 'general common object factorization',
+ yaml, common_subobject, common_job)
+
+ # look for a common script, and try factoring that out
+ _, count, proportion, script = next(iter(
+ build_histogram(yaml.values(), 'script')),
+ (None,) * 4)
+
+ if script and count > 1 and proportion >= 0.70:
+ yaml, other, applied, rest = try_optimization_pass(
+ 'script factorization',
+ yaml, common_subobject, {'script': script})
+
+ # look for a common before_script, and try factoring that out
+ _, count, proportion, script = next(iter(
+ build_histogram(yaml.values(), 'before_script')),
+ (None,) * 4)
+
+ if script and count > 1 and proportion >= 0.70:
+ yaml, other, applied, rest = try_optimization_pass(
+ 'before_script factorization',
+ yaml, common_subobject, {'before_script': script})
+
+ # Look specifically for the SPACK_ROOT_SPEC environment variables.
+ # Try to factor them out.
+ h = build_histogram((
+ getattr(val, 'get', lambda *args: {})('variables')
+ for val in yaml.values()), 'SPACK_ROOT_SPEC')
+
+ # In this case, we try to factor out *all* instances of the SPACK_ROOT_SPEC
+ # environment variable; not just the one that appears with the greatest
+ # frequency. We only require that more than 1 job uses a given instance's
+ # value, because we expect the value to be very large, and so expect even
+ # few-to-one factorizations to yield large space savings.
+ counter = 0
+ for _, count, proportion, spec in h:
+ if count <= 1:
+ continue
+
+ counter += 1
+
+ yaml, other, applied, rest = try_optimization_pass(
+ 'SPACK_ROOT_SPEC factorization ({count})'.format(count=counter),
+ yaml,
+ common_subobject,
+ {'variables': {'SPACK_ROOT_SPEC': spec}})
+
+ new_size = len(syaml.dump_config(yaml, default_flow_style=True))
+
+ print('\n')
+ print_delta('overall summary', original_size, new_size)
+ print('\n')
+ return yaml
diff --git a/lib/spack/spack/cmd/ci.py b/lib/spack/spack/cmd/ci.py
index 35891edb13..3e57c6656a 100644
--- a/lib/spack/spack/cmd/ci.py
+++ b/lib/spack/spack/cmd/ci.py
@@ -54,6 +54,11 @@ def setup_parser(subparser):
help="Provide a git branch or tag if a custom spack branch " +
"should be checked out as a step in each generated job. " +
"This argument is ignored if no --spack-repo is provided.")
+ generate.add_argument(
+ '--optimize', action='store_true',
+ help="(Experimental) run the generated document through a series of "
+ "optimization passes designed to reduce the size of the "
+ "generated file.")
generate.set_defaults(func=ci_generate)
# Check a spec against mirror. Rebuild, create buildcache and push to
@@ -75,6 +80,7 @@ def ci_generate(args):
copy_yaml_to = args.copy_to
spack_repo = args.spack_repo
spack_ref = args.spack_ref
+ run_optimizer = args.optimize
if not output_file:
gen_ci_dir = os.getcwd()
@@ -86,7 +92,8 @@ def ci_generate(args):
# Generate the jobs
spack_ci.generate_gitlab_ci_yaml(
- env, True, output_file, spack_repo, spack_ref)
+ env, True, output_file, spack_repo, spack_ref,
+ run_optimizer=run_optimizer)
if copy_yaml_to:
copy_to_dir = os.path.dirname(copy_yaml_to)