From 224dc951597253f426f16132eab43701a9f1c53e Mon Sep 17 00:00:00 2001
From: Omar Padron
Date: Mon, 22 Jun 2020 13:19:47 -0400
Subject: Pre ci optimization (#16372)

* add initial optimization script

* integrate optimization in spack ci

* make optimization opt-in

* fix import error

* flake8 fixes

* update command completion

* work around vermin errors

* fix sphynx errors
---
 lib/spack/spack/ci.py              |   8 +-
 lib/spack/spack/ci_optimization.py | 377 +++++++++++++++++++++++++++++++++++++
 lib/spack/spack/cmd/ci.py          |   9 +-
 share/spack/spack-completion.bash  |   2 +-
 4 files changed, 393 insertions(+), 3 deletions(-)
 create mode 100644 lib/spack/spack/ci_optimization.py

diff --git a/lib/spack/spack/ci.py b/lib/spack/spack/ci.py
index 9ba7f39e45..ce74abf29c 100644
--- a/lib/spack/spack/ci.py
+++ b/lib/spack/spack/ci.py
@@ -449,7 +449,8 @@ def format_job_needs(phase_name, strip_compilers, dep_jobs,
 
 
 def generate_gitlab_ci_yaml(env, print_summary, output_file,
-                            custom_spack_repo=None, custom_spack_ref=None):
+                            custom_spack_repo=None, custom_spack_ref=None,
+                            run_optimizer=False):
     # FIXME: What's the difference between one that opens with 'spack'
     # and one that opens with 'env'? This will only handle the former.
     with spack.concretize.disable_compiler_existence_check():
@@ -788,6 +789,11 @@ def generate_gitlab_ci_yaml(env, print_summary, output_file,
     for output_key, output_value in sorted(output_object.items()):
         sorted_output[output_key] = output_value
 
+    # TODO(opadron): remove this or refactor
+    if run_optimizer:
+        import spack.ci_optimization as ci_opt
+        sorted_output = ci_opt.optimizer(sorted_output)
+
     with open(output_file, 'w') as outf:
         outf.write(syaml.dump_config(sorted_output, default_flow_style=True))
 
diff --git a/lib/spack/spack/ci_optimization.py b/lib/spack/spack/ci_optimization.py
new file mode 100644
index 0000000000..693802d06d
--- /dev/null
+++ b/lib/spack/spack/ci_optimization.py
@@ -0,0 +1,377 @@
+# Copyright 2013-2020 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+import collections
+
+try:
+    # dynamically import to keep vermin from complaining
+    collections_abc = __import__('collections.abc')
+except ImportError:
+    collections_abc = collections
+
+import copy
+import hashlib
+
+import spack.util.spack_yaml as syaml
+
+
+def matches(obj, proto):
+    """Returns True if the test object "obj" matches the prototype object
+    "proto".
+
+    If obj and proto are mappings, obj matches proto if (key in obj) and
+    (obj[key] matches proto[key]) for every key in proto.
+
+    If obj and proto are sequences, obj matches proto if they are of the same
+    length and (a matches b) for every (a, b) in zip(obj, proto).
+
+    Otherwise, obj matches proto if obj == proto.
+
+    Precondition: proto must not have any reference cycles
+    """
+    if isinstance(obj, collections_abc.Mapping):
+        if not isinstance(proto, collections_abc.Mapping):
+            return False
+
+        return all(
+            (key in obj and matches(obj[key], val))
+            for key, val in proto.items()
+        )
+
+    if (isinstance(obj, collections_abc.Sequence) and
+            not isinstance(obj, str)):
+
+        if not (isinstance(proto, collections_abc.Sequence) and
+                not isinstance(proto, str)):
+            return False
+
+        if len(obj) != len(proto):
+            return False
+
+        return all(
+            matches(obj[index], val)
+            for index, val in enumerate(proto)
+        )
+
+    return obj == proto
+
+
+def subkeys(obj, proto):
+    """Returns the test mapping "obj" after factoring out the items it has in
+    common with the prototype mapping "proto".
+
+    Consider a recursive merge operation, merge(a, b), on mappings a and b
+    that returns a mapping, m, whose keys are the union of the keys of a and
+    b, and for every such key, "key", its corresponding value is:
+
+      - merge(a[key], b[key])  if a[key] and b[key] are mappings, or
+      - b[key]  if (key in b) and not matches(a[key], b[key]), or
+      - a[key]  otherwise
+
+    If obj and proto are mappings, the returned object is the smallest object,
+    "a", such that merge(a, proto) matches obj.
+
+    Otherwise, obj is returned.
+    """
+    if not (isinstance(obj, collections_abc.Mapping) and
+            isinstance(proto, collections_abc.Mapping)):
+        return obj
+
+    new_obj = {}
+    for key, value in obj.items():
+        if key not in proto:
+            new_obj[key] = value
+            continue
+
+        if (matches(value, proto[key]) and
+                matches(proto[key], value)):
+            continue
+
+        if isinstance(value, collections_abc.Mapping):
+            new_obj[key] = subkeys(value, proto[key])
+            continue
+
+        new_obj[key] = value
+
+    return new_obj
+
+
+def add_extends(yaml, key):
+    """Modifies the given object "yaml" so that it includes an "extends" key
+    whose value features "key".
+
+    If "extends" is not in yaml, then yaml is modified such that
+    yaml["extends"] == key.
+
+    If yaml["extends"] is a str, then yaml is modified such that
+    yaml["extends"] == [yaml["extends"], key].
+
+    If yaml["extends"] is a list that does not include key, then key is
+    appended to the list.
+
+    Otherwise, yaml is left unchanged.
+    """
+
+    has_key = ('extends' in yaml)
+    extends = yaml.get('extends')
+
+    if has_key and not isinstance(extends, (str, collections_abc.Sequence)):
+        return
+
+    if extends is None:
+        yaml['extends'] = key
+        return
+
+    if isinstance(extends, str):
+        if extends != key:
+            yaml['extends'] = [extends, key]
+        return
+
+    if key not in extends:
+        extends.append(key)
+
+
+def common_subobject(yaml, sub):
+    """Factor prototype object "sub" out of the values of mapping "yaml".
+
+    Consider a modified copy of yaml, "new", where for each key, "key", in
+    yaml:
+
+      - If yaml[key] matches sub, then new[key] = subkeys(yaml[key], sub).
+      - Otherwise, new[key] = yaml[key].
+
+    If the above match criterion is not satisfied for any such key, then
+    (yaml, None) is returned; the yaml object is returned unchanged.
+
+    Otherwise, each matching value in new is modified as in
+    add_extends(new[key], common_key), and then new[common_key] is set to sub.
+    The common_key value is chosen such that it does not match any preexisting
+    key in new. In this case, (new, common_key) is returned.
+    """
+    match_list = set(k for k, v in yaml.items() if matches(v, sub))
+
+    if not match_list:
+        return yaml, None
+
+    common_prefix = '.c'
+    common_index = 0
+
+    while True:
+        common_key = ''.join((common_prefix, str(common_index)))
+        if common_key not in yaml:
+            break
+        common_index += 1
+
+    new_yaml = {}
+
+    for key, val in yaml.items():
+        new_yaml[key] = copy.deepcopy(val)
+
+        if not matches(val, sub):
+            continue
+
+        new_yaml[key] = subkeys(new_yaml[key], sub)
+        add_extends(new_yaml[key], common_key)
+
+    new_yaml[common_key] = sub
+
+    return new_yaml, common_key
+
+
+def print_delta(name, old, new, applied=None):
+    delta = new - old
+    reldelta = (1000 * delta) // old
+    reldelta = (reldelta // 10, reldelta % 10)
+
+    if applied is None:
+        applied = (new <= old)
+
+    print('\n'.join((
+        '{} {}:',
+        '  before: {: 10d}',
+        '  after : {: 10d}',
+        '  delta : {:+10d} ({:=+3d}.{}%)',
+    )).format(
+        name,
+        ('+' if applied else 'x'),
+        old,
+        new,
+        delta,
+        reldelta[0],
+        reldelta[1]
+    ))
+
+
+def try_optimization_pass(name, yaml, optimization_pass, *args, **kwargs):
+    """Try applying an optimization pass and return information about the
+    result.
+
+    "name" is a string describing the nature of the pass. If it is a non-empty
+    string, summary statistics are also printed to stdout.
+
+    "yaml" is the object to apply the pass to.
+
+    "optimization_pass" is the function implementing the pass to be applied.
+
+    "args" and "kwargs" are the additional arguments to pass to the
+    optimization pass. The pass is applied as
+
+    >>> (new_yaml, *other_results) = optimization_pass(yaml, *args, **kwargs)
+
+    The pass's results are greedily rejected if it does not modify the
+    original yaml document, or if it produces a yaml document that serializes
+    to a larger string.
+
+    Returns (new_yaml, yaml, applied, other_results) if applied, or
+    (yaml, new_yaml, applied, other_results) otherwise.
+    """
+    result = optimization_pass(yaml, *args, **kwargs)
+    new_yaml, other_results = result[0], result[1:]
+
+    if new_yaml is yaml:
+        # pass was not applied
+        return (yaml, new_yaml, False, other_results)
+
+    pre_size = len(syaml.dump_config(yaml, default_flow_style=True))
+    post_size = len(syaml.dump_config(new_yaml, default_flow_style=True))
+
+    # pass makes the size worse: not applying
+    applied = (post_size <= pre_size)
+    if applied:
+        yaml, new_yaml = new_yaml, yaml
+
+    if name:
+        print_delta(name, pre_size, post_size, applied)
+
+    return (yaml, new_yaml, applied, other_results)
+
+
+def build_histogram(iterator, key):
+    """Builds a histogram of values given an iterable of mappings and a key.
+
+    For each mapping "m" with key "key" in iterator, the value m[key] is
+    considered.
+
+    Returns a list of tuples (hash, count, proportion, value), where
+
+      - "hash" is a sha1sum hash of the value.
+      - "count" is the number of occurrences of values that hash to "hash".
+      - "proportion" is the proportion of all values considered above that
+        hash to "hash".
+      - "value" is one of the values considered above that hash to "hash".
+        Which value is chosen when multiple values hash to the same "hash" is
+        undefined.
+
+    The list is sorted in descending order by count, yielding the most
+    frequently occurring hashes first.
+    """
+    buckets = collections.defaultdict(int)
+    values = {}
+
+    num_objects = 0
+    for obj in iterator:
+        num_objects += 1
+
+        try:
+            val = obj[key]
+        except (KeyError, TypeError):
+            continue
+
+        value_hash = hashlib.sha1()
+        value_hash.update(syaml.dump_config(val).encode())
+        value_hash = value_hash.hexdigest()
+
+        buckets[value_hash] += 1
+        values[value_hash] = val
+
+    return [(h, buckets[h], float(buckets[h]) / num_objects, values[h])
+            for h in sorted(buckets.keys(), key=lambda k: -buckets[k])]
+
+
+def optimizer(yaml):
+    original_size = len(syaml.dump_config(yaml, default_flow_style=True))
+
+    # try factoring out commonly repeated portions
+    common_job = {
+        'variables': {
+            'SPACK_COMPILER_ACTION': 'NONE',
+            'SPACK_RELATED_BUILDS_CDASH': ''
+        },
+
+        'after_script': ['rm -rf "./spack"'],
+
+        'artifacts': {
+            'paths': ['jobs_scratch_dir', 'cdash_report'],
+            'when': 'always'
+        },
+    }
+
+    # look for a list of tags that appear frequently
+    _, count, proportion, tags = next(iter(
+        build_histogram(yaml.values(), 'tags')),
+        (None,) * 4)
+
+    # If a list of tags is found, more than one job uses it, *and* the jobs
+    # that use it represent at least 70% of all jobs, then add the list to
+    # the prototype object.
+    if tags and count > 1 and proportion >= 0.70:
+        common_job['tags'] = tags
+
+    # apply common object factorization
+    yaml, other, applied, rest = try_optimization_pass(
+        'general common object factorization',
+        yaml, common_subobject, common_job)
+
+    # look for a common script, and try factoring that out
+    _, count, proportion, script = next(iter(
+        build_histogram(yaml.values(), 'script')),
+        (None,) * 4)
+
+    if script and count > 1 and proportion >= 0.70:
+        yaml, other, applied, rest = try_optimization_pass(
+            'script factorization',
+            yaml, common_subobject, {'script': script})
+
+    # look for a common before_script, and try factoring that out
+    _, count, proportion, script = next(iter(
+        build_histogram(yaml.values(), 'before_script')),
+        (None,) * 4)
+
+    if script and count > 1 and proportion >= 0.70:
+        yaml, other, applied, rest = try_optimization_pass(
+            'before_script factorization',
+            yaml, common_subobject, {'before_script': script})
+
+    # Look specifically for the SPACK_ROOT_SPEC environment variables.
+    # Try to factor them out.
+    h = build_histogram((
+        getattr(val, 'get', lambda *args: {})('variables')
+        for val in yaml.values()), 'SPACK_ROOT_SPEC')
+
+    # In this case, we try to factor out *all* instances of the
+    # SPACK_ROOT_SPEC environment variable, not just the one that appears
+    # with the greatest frequency. We only require that more than one job
+    # uses a given instance's value, because we expect the value to be very
+    # large, so even few-to-one factorizations yield large space savings.
+    counter = 0
+    for _, count, proportion, spec in h:
+        if count <= 1:
+            continue
+
+        counter += 1
+
+        yaml, other, applied, rest = try_optimization_pass(
+            'SPACK_ROOT_SPEC factorization ({count})'.format(count=counter),
+            yaml,
+            common_subobject,
+            {'variables': {'SPACK_ROOT_SPEC': spec}})
+
+    new_size = len(syaml.dump_config(yaml, default_flow_style=True))
+
+    print('\n')
+    print_delta('overall summary', original_size, new_size)
+    print('\n')
+    return yaml
diff --git a/lib/spack/spack/cmd/ci.py b/lib/spack/spack/cmd/ci.py
index 35891edb13..3e57c6656a 100644
--- a/lib/spack/spack/cmd/ci.py
+++ b/lib/spack/spack/cmd/ci.py
@@ -54,6 +54,11 @@ def setup_parser(subparser):
         help="Provide a git branch or tag if a custom spack branch " +
              "should be checked out as a step in each generated job. " +
              "This argument is ignored if no --spack-repo is provided.")
+    generate.add_argument(
+        '--optimize', action='store_true',
+        help="(Experimental) run the generated document through a series of "
+             "optimization passes designed to reduce the size of the "
+             "generated file.")
     generate.set_defaults(func=ci_generate)
 
     # Check a spec against mirror. Rebuild, create buildcache and push to
@@ -75,6 +80,7 @@ def ci_generate(args):
     copy_yaml_to = args.copy_to
     spack_repo = args.spack_repo
     spack_ref = args.spack_ref
+    run_optimizer = args.optimize
 
     if not output_file:
         gen_ci_dir = os.getcwd()
@@ -86,7 +92,8 @@ def ci_generate(args):
 
     # Generate the jobs
     spack_ci.generate_gitlab_ci_yaml(
-        env, True, output_file, spack_repo, spack_ref)
+        env, True, output_file, spack_repo, spack_ref,
+        run_optimizer=run_optimizer)
 
     if copy_yaml_to:
         copy_to_dir = os.path.dirname(copy_yaml_to)
diff --git a/share/spack/spack-completion.bash b/share/spack/spack-completion.bash
index 0a74d14488..ed42ad4edd 100755
--- a/share/spack/spack-completion.bash
+++ b/share/spack/spack-completion.bash
@@ -474,7 +474,7 @@ _spack_ci() {
 }
 
 _spack_ci_generate() {
-    SPACK_COMPREPLY="-h --help --output-file --copy-to --spack-repo --spack-ref"
+    SPACK_COMPREPLY="-h --help --output-file --copy-to --spack-repo --spack-ref --optimize"
 }
 
 _spack_ci_rebuild() {
--
cgit v1.2.3-60-g2f50
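For reference, a minimal sketch of what the common_subobject() pass added by this commit does to a generated pipeline mapping. The job names and spec strings below are hypothetical, and the snippet assumes Spack's lib/spack directory (plus its bundled external modules) is on PYTHONPATH so that spack.ci_optimization imports cleanly:

    import spack.ci_optimization as ci_opt

    # Two generated GitLab jobs that share the same 'tags' and 'script' values.
    pipeline = {
        'build-zlib': {
            'tags': ['spack-kube'],
            'script': ['spack ci rebuild'],
            'variables': {'SPACK_ROOT_SPEC': 'zlib arch=linux-x86_64'},
        },
        'build-bzip2': {
            'tags': ['spack-kube'],
            'script': ['spack ci rebuild'],
            'variables': {'SPACK_ROOT_SPEC': 'bzip2 arch=linux-x86_64'},
        },
    }

    # Factor the shared portion out into a hidden prototype job; every job
    # that matches the prototype loses the duplicated keys and gains an
    # 'extends' reference instead.
    prototype = {'tags': ['spack-kube'], 'script': ['spack ci rebuild']}
    new_pipeline, common_key = ci_opt.common_subobject(pipeline, prototype)

    # common_key is '.c0' (the first unused '.cN' name), and new_pipeline is
    # roughly:
    #   {'.c0': {'tags': ['spack-kube'], 'script': ['spack ci rebuild']},
    #    'build-zlib': {'extends': '.c0', 'variables': {...}},
    #    'build-bzip2': {'extends': '.c0', 'variables': {...}}}

optimizer() chains several passes of this kind (a general common-job pass, script and before_script passes, and per-value SPACK_ROOT_SPEC passes) and keeps a pass only when the serialized YAML does not grow. The whole machinery runs only when spack ci generate is invoked with the new --optimize flag.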