diff options
author:    Wouter Deconinck <wdconinc@gmail.com>  2023-04-13 13:26:26 -0500
committer: GitHub <noreply@github.com>            2023-04-13 20:26:26 +0200
commit:    ff319e98635fb952f1c26203c9041641f41687de (patch)
tree:      c1f89d8c2eb986e2f346bf3061529f6139b73e8a /lib
parent:    d918ae0bde903a2041a4c9d9edef5c59c63925cf (diff)
download:  spack-ff319e98635fb952f1c26203c9041641f41687de.tar.gz
           spack-ff319e98635fb952f1c26203c9041641f41687de.tar.bz2
           spack-ff319e98635fb952f1c26203c9041641f41687de.tar.xz
           spack-ff319e98635fb952f1c26203c9041641f41687de.zip
Resolve `<include-fragment>` tags e.g. in github release pages (#36674)
This aims to resolve #34164 by resolving the <include-fragment> tags
that GitHub has started using for their release pages, see
https://github.github.io/include-fragment-element/.
This feels a bit hacky but intended as a starting point for discussion.
After reading a page during spidering, it first parses for
include-fragments, gets them all, and treats them all as separate pages.
Then it looks for href links in both the page itself and the fragments.
Co-authored-by: Alec Scott <alec@bcs.sh>
Diffstat (limited to 'lib')
 lib/spack/spack/test/data/web/fragment.html            |  1 +
 lib/spack/spack/test/data/web/index_with_fragment.html | 13 +
 lib/spack/spack/test/web.py                            | 10 +
 lib/spack/spack/util/web.py                            | 48 +-
 4 files changed, 70 insertions(+), 2 deletions(-)
diff --git a/lib/spack/spack/test/data/web/fragment.html b/lib/spack/spack/test/data/web/fragment.html new file mode 100644 index 0000000000..6442dff95e --- /dev/null +++ b/lib/spack/spack/test/data/web/fragment.html @@ -0,0 +1 @@ +<a href="foo-5.0.0.tar.gz">foo-5.0.0.tar.gz</a> diff --git a/lib/spack/spack/test/data/web/index_with_fragment.html b/lib/spack/spack/test/data/web/index_with_fragment.html new file mode 100644 index 0000000000..6f5e4e0dcf --- /dev/null +++ b/lib/spack/spack/test/data/web/index_with_fragment.html @@ -0,0 +1,13 @@ +<html> + <head> + This is the root page. + </head> + <body> + This is a page with an include-fragment element. + + <script type="module" src="https://unpkg.com/@github/include-fragment-element@latest?module"></script> + <include-fragment src="fragment.html"> + <p>Loading...</p> + </include-fragment> + </body> +</html> diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py index 4cc5277cd0..7f6c12a5e2 100644 --- a/lib/spack/spack/test/web.py +++ b/lib/spack/spack/test/web.py @@ -31,6 +31,8 @@ page_2 = _create_url("2.html") page_3 = _create_url("3.html") page_4 = _create_url("4.html") +root_with_fragment = _create_url("index_with_fragment.html") + @pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)") @pytest.mark.parametrize( @@ -143,6 +145,14 @@ def test_find_exotic_versions_of_archive_3(): assert ver("4.5-rc5") in versions +@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)") +def test_find_versions_of_archive_with_fragment(): + versions = spack.util.web.find_versions_of_archive( + root_tarball, root_with_fragment, list_depth=0 + ) + assert ver("5.0.0") in versions + + def test_get_header(): headers = {"Content-type": "text/plain"} diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py index a57e55649f..6101ffec3f 100644 --- a/lib/spack/spack/util/web.py +++ b/lib/spack/spack/util/web.py @@ -75,7 +75,7 @@ class 
LinkParser(HTMLParser): links. Good enough for a really simple spider.""" def __init__(self): - HTMLParser.__init__(self) + super().__init__() self.links = [] def handle_starttag(self, tag, attrs): @@ -85,6 +85,21 @@ class LinkParser(HTMLParser): self.links.append(val) +class IncludeFragmentParser(HTMLParser): + """This parser takes an HTML page and selects the include-fragments, + used on GitHub, https://github.github.io/include-fragment-element.""" + + def __init__(self): + super().__init__() + self.links = [] + + def handle_starttag(self, tag, attrs): + if tag == "include-fragment": + for attr, val in attrs: + if attr == "src": + self.links.append(val) + + def read_from_url(url, accept_content_type=None): if isinstance(url, str): url = urllib.parse.urlparse(url) @@ -550,9 +565,38 @@ def spider(root_urls, depth=0, concurrency=32): page = codecs.getreader("utf-8")(response).read() pages[response_url] = page - # Parse out the links in the page + # Parse out the include-fragments in the page + # https://github.github.io/include-fragment-element + include_fragment_parser = IncludeFragmentParser() + include_fragment_parser.feed(page) + + fragments = set() + while include_fragment_parser.links: + raw_link = include_fragment_parser.links.pop() + abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True) + + try: + # This seems to be text/html, though text/fragment+html is also used + fragment_response_url, _, fragment_response = read_from_url( + abs_link, "text/html" + ) + except Exception as e: + msg = f"Error reading fragment: {(type(e), str(e))}:{traceback.format_exc()}" + tty.debug(msg) + + if not fragment_response_url or not fragment_response: + continue + + fragment = codecs.getreader("utf-8")(fragment_response).read() + fragments.add(fragment) + + pages[fragment_response_url] = fragment + + # Parse out the links in the page and all fragments link_parser = LinkParser() link_parser.feed(page) + for fragment in fragments: + 
link_parser.feed(fragment) while link_parser.links: raw_link = link_parser.links.pop()