summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Harmen Stoppels <harmenstoppels@gmail.com>, 2023-10-11 17:49:50 +0200
committer: GitHub <noreply@github.com>, 2023-10-11 08:49:50 -0700
commit: 65e7ec0509a2191b0a323730e280bba8bd30395b (patch)
tree: c6f6d161be7a79b18eb2b0e34032fe82633c8a87
parent: 1ab88866952e8dc73be7c93dcdcc3cb5965fda10 (diff)
download: spack-65e7ec0509a2191b0a323730e280bba8bd30395b.tar.gz
spack-65e7ec0509a2191b0a323730e280bba8bd30395b.tar.bz2
spack-65e7ec0509a2191b0a323730e280bba8bd30395b.tar.xz
spack-65e7ec0509a2191b0a323730e280bba8bd30395b.zip
spider: respect <base> tag (#40443)
-rw-r--r--  lib/spack/spack/util/web.py  28
1 file changed, 20 insertions, 8 deletions
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index eca7bd72a2..57158db950 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -110,19 +110,28 @@ class LinkParser(HTMLParser):
self.links.append(val)
-class IncludeFragmentParser(HTMLParser):
+class ExtractMetadataParser(HTMLParser):
"""This parser takes an HTML page and selects the include-fragments,
- used on GitHub, https://github.github.io/include-fragment-element."""
+ used on GitHub, https://github.github.io/include-fragment-element,
+ as well as a possible base url."""
def __init__(self):
super().__init__()
- self.links = []
+ self.fragments = []
+ self.base_url = None
def handle_starttag(self, tag, attrs):
+ # <include-fragment src="..." />
if tag == "include-fragment":
for attr, val in attrs:
if attr == "src":
- self.links.append(val)
+ self.fragments.append(val)
+
+ # <base href="..." />
+ elif tag == "base":
+ for attr, val in attrs:
+ if attr == "href":
+ self.base_url = val
def read_from_url(url, accept_content_type=None):
@@ -625,12 +634,15 @@ def _spider(url: urllib.parse.ParseResult, collect_nested: bool, _visited: Set[s
# Parse out the include-fragments in the page
# https://github.github.io/include-fragment-element
- include_fragment_parser = IncludeFragmentParser()
- include_fragment_parser.feed(page)
+ metadata_parser = ExtractMetadataParser()
+ metadata_parser.feed(page)
+
+ # Change of base URL due to <base href="..." /> tag
+ response_url = metadata_parser.base_url or response_url
fragments = set()
- while include_fragment_parser.links:
- raw_link = include_fragment_parser.links.pop()
+ while metadata_parser.fragments:
+ raw_link = metadata_parser.fragments.pop()
abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True)
try: