summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/spack/spack/util/web.py28
1 files changed, 20 insertions, 8 deletions
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index eca7bd72a2..57158db950 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -110,19 +110,28 @@ class LinkParser(HTMLParser):
self.links.append(val)
-class IncludeFragmentParser(HTMLParser):
+class ExtractMetadataParser(HTMLParser):
"""This parser takes an HTML page and selects the include-fragments,
- used on GitHub, https://github.github.io/include-fragment-element."""
+ used on GitHub, https://github.github.io/include-fragment-element,
+ as well as a possible base url."""
def __init__(self):
super().__init__()
- self.links = []
+ self.fragments = []
+ self.base_url = None
def handle_starttag(self, tag, attrs):
+ # <include-fragment src="..." />
if tag == "include-fragment":
for attr, val in attrs:
if attr == "src":
- self.links.append(val)
+ self.fragments.append(val)
+
+ # <base href="..." />
+ elif tag == "base":
+ for attr, val in attrs:
+ if attr == "href":
+ self.base_url = val
def read_from_url(url, accept_content_type=None):
@@ -625,12 +634,15 @@ def _spider(url: urllib.parse.ParseResult, collect_nested: bool, _visited: Set[s
# Parse out the include-fragments in the page
# https://github.github.io/include-fragment-element
- include_fragment_parser = IncludeFragmentParser()
- include_fragment_parser.feed(page)
+ metadata_parser = ExtractMetadataParser()
+ metadata_parser.feed(page)
+
+ # Change of base URL due to <base href="..." /> tag
+ response_url = metadata_parser.base_url or response_url
fragments = set()
- while include_fragment_parser.links:
- raw_link = include_fragment_parser.links.pop()
+ while metadata_parser.fragments:
+ raw_link = metadata_parser.fragments.pop()
abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True)
try: