summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Wouter Deconinck <wdconinc@gmail.com> 2023-04-13 13:26:26 -0500
committer GitHub <noreply@github.com> 2023-04-13 20:26:26 +0200
commit ff319e98635fb952f1c26203c9041641f41687de (patch)
tree c1f89d8c2eb986e2f346bf3061529f6139b73e8a
parent d918ae0bde903a2041a4c9d9edef5c59c63925cf (diff)
download spack-ff319e98635fb952f1c26203c9041641f41687de.tar.gz
spack-ff319e98635fb952f1c26203c9041641f41687de.tar.bz2
spack-ff319e98635fb952f1c26203c9041641f41687de.tar.xz
spack-ff319e98635fb952f1c26203c9041641f41687de.zip
Resolve `<include-fragment>` tags e.g. in github release pages (#36674)
This aims to resolve #34164 by resolving the `<include-fragment>` tags that GitHub has started using for its release pages; see https://github.github.io/include-fragment-element/. This feels a bit hacky, but it is intended as a starting point for discussion.

After reading a page during spidering, the spider first parses it for include-fragments, fetches each of them, and treats each one as a separate page. It then looks for href links in both the page itself and the fragments.

Co-authored-by: Alec Scott <alec@bcs.sh>
-rw-r--r-- lib/spack/spack/test/data/web/fragment.html 1
-rw-r--r-- lib/spack/spack/test/data/web/index_with_fragment.html 13
-rw-r--r-- lib/spack/spack/test/web.py 10
-rw-r--r-- lib/spack/spack/util/web.py 48
4 files changed, 70 insertions, 2 deletions
diff --git a/lib/spack/spack/test/data/web/fragment.html b/lib/spack/spack/test/data/web/fragment.html
new file mode 100644
index 0000000000..6442dff95e
--- /dev/null
+++ b/lib/spack/spack/test/data/web/fragment.html
@@ -0,0 +1 @@
+<a href="foo-5.0.0.tar.gz">foo-5.0.0.tar.gz</a>
diff --git a/lib/spack/spack/test/data/web/index_with_fragment.html b/lib/spack/spack/test/data/web/index_with_fragment.html
new file mode 100644
index 0000000000..6f5e4e0dcf
--- /dev/null
+++ b/lib/spack/spack/test/data/web/index_with_fragment.html
@@ -0,0 +1,13 @@
+<html>
+ <head>
+ This is the root page.
+ </head>
+ <body>
+ This is a page with an include-fragment element.
+
+ <script type="module" src="https://unpkg.com/@github/include-fragment-element@latest?module"></script>
+ <include-fragment src="fragment.html">
+ <p>Loading...</p>
+ </include-fragment>
+ </body>
+</html>
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index 4cc5277cd0..7f6c12a5e2 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -31,6 +31,8 @@ page_2 = _create_url("2.html")
page_3 = _create_url("3.html")
page_4 = _create_url("4.html")
+root_with_fragment = _create_url("index_with_fragment.html")
+
@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)")
@pytest.mark.parametrize(
@@ -143,6 +145,14 @@ def test_find_exotic_versions_of_archive_3():
assert ver("4.5-rc5") in versions
+@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)")
+def test_find_versions_of_archive_with_fragment():
+    """A version linked only from an include-fragment target is found.
+
+    ``index_with_fragment.html`` pulls in ``fragment.html`` through an
+    ``<include-fragment>`` element, and only the fragment links to
+    ``foo-5.0.0.tar.gz``.
+    """
+    expected = ver("5.0.0")
+    found = spack.util.web.find_versions_of_archive(
+        root_tarball, root_with_fragment, list_depth=0
+    )
+    assert expected in found
+
+
def test_get_header():
headers = {"Content-type": "text/plain"}
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index a57e55649f..6101ffec3f 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -75,7 +75,7 @@ class LinkParser(HTMLParser):
links. Good enough for a really simple spider."""
def __init__(self):
- HTMLParser.__init__(self)
+ super().__init__()
self.links = []
def handle_starttag(self, tag, attrs):
@@ -85,6 +85,21 @@ class LinkParser(HTMLParser):
self.links.append(val)
+class IncludeFragmentParser(HTMLParser):
+    """Collect the ``src`` targets of ``<include-fragment>`` elements.
+
+    These elements are used on GitHub, see
+    https://github.github.io/include-fragment-element.
+
+    After ``feed()`` has been called, ``links`` holds the raw ``src``
+    values in the order the elements were encountered. Values are stored
+    exactly as written in the page; the caller is expected to strip and
+    resolve them against the page URL.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.links = []
+
+    def handle_starttag(self, tag, attrs):
+        """Record the ``src`` attribute of an ``<include-fragment>`` tag.
+
+        Args:
+            tag: lower-cased tag name, as passed in by ``HTMLParser``.
+            attrs: list of ``(name, value)`` pairs for the tag.
+        """
+        if tag != "include-fragment":
+            return
+
+        # HTMLParser reports a valueless attribute (``<include-fragment src>``)
+        # as ``None``. Skip it, along with empty or whitespace-only values,
+        # so that every collected link is a usable, non-blank string.
+        self.links.extend(
+            val for attr, val in attrs if attr == "src" and val and val.strip()
+        )
+
+
def read_from_url(url, accept_content_type=None):
if isinstance(url, str):
url = urllib.parse.urlparse(url)
@@ -550,9 +565,38 @@ def spider(root_urls, depth=0, concurrency=32):
page = codecs.getreader("utf-8")(response).read()
pages[response_url] = page
- # Parse out the links in the page
+ # Parse out the include-fragments in the page
+ # https://github.github.io/include-fragment-element
+ include_fragment_parser = IncludeFragmentParser()
+ include_fragment_parser.feed(page)
+
+ fragments = set()
+ while include_fragment_parser.links:
+ raw_link = include_fragment_parser.links.pop()
+ abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True)
+
+ try:
+ # This seems to be text/html, though text/fragment+html is also used
+ fragment_response_url, _, fragment_response = read_from_url(
+ abs_link, "text/html"
+ )
+ except Exception as e:
+ msg = f"Error reading fragment: {(type(e), str(e))}:{traceback.format_exc()}"
+ tty.debug(msg)
+
+ if not fragment_response_url or not fragment_response:
+ continue
+
+ fragment = codecs.getreader("utf-8")(fragment_response).read()
+ fragments.add(fragment)
+
+ pages[fragment_response_url] = fragment
+
+ # Parse out the links in the page and all fragments
link_parser = LinkParser()
link_parser.feed(page)
+ for fragment in fragments:
+ link_parser.feed(fragment)
while link_parser.links:
raw_link = link_parser.links.pop()