From ff319e98635fb952f1c26203c9041641f41687de Mon Sep 17 00:00:00 2001 From: Wouter Deconinck Date: Thu, 13 Apr 2023 13:26:26 -0500 Subject: Resolve `<include-fragment>` tags e.g. in github release pages (#36674) This aims to resolve #34164 by resolving the `<include-fragment>` tags that GitHub has started using for their release pages, see https://github.github.io/include-fragment-element/. This feels a bit hacky but is intended as a starting point for discussion. After reading a page during spidering, it first parses for include-fragments, gets them all, and treats them all as separate pages. Then it looks for href links in both the page itself and the fragments. Co-authored-by: Alec Scott --- lib/spack/spack/test/data/web/fragment.html | 1 + .../spack/test/data/web/index_with_fragment.html | 13 ++++++ lib/spack/spack/test/web.py | 10 +++++ lib/spack/spack/util/web.py | 48 +++++++++++++++++++++- 4 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 lib/spack/spack/test/data/web/fragment.html create mode 100644 lib/spack/spack/test/data/web/index_with_fragment.html diff --git a/lib/spack/spack/test/data/web/fragment.html b/lib/spack/spack/test/data/web/fragment.html new file mode 100644 index 0000000000..6442dff95e --- /dev/null +++ b/lib/spack/spack/test/data/web/fragment.html @@ -0,0 +1 @@ +foo-5.0.0.tar.gz diff --git a/lib/spack/spack/test/data/web/index_with_fragment.html b/lib/spack/spack/test/data/web/index_with_fragment.html new file mode 100644 index 0000000000..6f5e4e0dcf --- /dev/null +++ b/lib/spack/spack/test/data/web/index_with_fragment.html @@ -0,0 +1,13 @@ + + + This is the root page. + + + This is a page with an include-fragment element. + + + +

+ + + diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py index 4cc5277cd0..7f6c12a5e2 100644 --- a/lib/spack/spack/test/web.py +++ b/lib/spack/spack/test/web.py @@ -31,6 +31,8 @@ page_2 = _create_url("2.html") page_3 = _create_url("3.html") page_4 = _create_url("4.html") +root_with_fragment = _create_url("index_with_fragment.html") + @pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)") @pytest.mark.parametrize( @@ -143,6 +145,14 @@ def test_find_exotic_versions_of_archive_3(): assert ver("4.5-rc5") in versions +@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)") +def test_find_versions_of_archive_with_fragment(): + versions = spack.util.web.find_versions_of_archive( + root_tarball, root_with_fragment, list_depth=0 + ) + assert ver("5.0.0") in versions + + def test_get_header(): headers = {"Content-type": "text/plain"} diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py index a57e55649f..6101ffec3f 100644 --- a/lib/spack/spack/util/web.py +++ b/lib/spack/spack/util/web.py @@ -75,7 +75,7 @@ class LinkParser(HTMLParser): links. 
Good enough for a really simple spider.""" def __init__(self): - HTMLParser.__init__(self) + super().__init__() self.links = [] def handle_starttag(self, tag, attrs): @@ -85,6 +85,21 @@ class LinkParser(HTMLParser): self.links.append(val) +class IncludeFragmentParser(HTMLParser): + """This parser takes an HTML page and selects the include-fragments, + used on GitHub, https://github.github.io/include-fragment-element.""" + + def __init__(self): + super().__init__() + self.links = [] + + def handle_starttag(self, tag, attrs): + if tag == "include-fragment": + for attr, val in attrs: + if attr == "src": + self.links.append(val) + + def read_from_url(url, accept_content_type=None): if isinstance(url, str): url = urllib.parse.urlparse(url) @@ -550,9 +565,38 @@ def spider(root_urls, depth=0, concurrency=32): page = codecs.getreader("utf-8")(response).read() pages[response_url] = page - # Parse out the links in the page + # Parse out the include-fragments in the page + # https://github.github.io/include-fragment-element + include_fragment_parser = IncludeFragmentParser() + include_fragment_parser.feed(page) + + fragments = set() + while include_fragment_parser.links: + raw_link = include_fragment_parser.links.pop() + abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True) + + try: + # This seems to be text/html, though text/fragment+html is also used + fragment_response_url, _, fragment_response = read_from_url( + abs_link, "text/html" + ) + except Exception as e: + msg = f"Error reading fragment: {(type(e), str(e))}:{traceback.format_exc()}" + tty.debug(msg) + + if not fragment_response_url or not fragment_response: + continue + + fragment = codecs.getreader("utf-8")(fragment_response).read() + fragments.add(fragment) + + pages[fragment_response_url] = fragment + + # Parse out the links in the page and all fragments link_parser = LinkParser() link_parser.feed(page) + for fragment in fragments: + link_parser.feed(fragment) while link_parser.links: 
raw_link = link_parser.links.pop() -- cgit v1.2.3-70-g09d2