From 65e7ec0509a2191b0a323730e280bba8bd30395b Mon Sep 17 00:00:00 2001
From: Harmen Stoppels
Date: Wed, 11 Oct 2023 17:49:50 +0200
Subject: spider: respect <base> tag (#40443)

---
 lib/spack/spack/util/web.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index eca7bd72a2..57158db950 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -110,19 +110,28 @@ class LinkParser(HTMLParser):
                     self.links.append(val)
 
 
-class IncludeFragmentParser(HTMLParser):
+class ExtractMetadataParser(HTMLParser):
     """This parser takes an HTML page and selects the include-fragments,
-    used on GitHub, https://github.github.io/include-fragment-element."""
+    used on GitHub, https://github.github.io/include-fragment-element,
+    as well as a possible base url."""
 
     def __init__(self):
         super().__init__()
-        self.links = []
+        self.fragments = []
+        self.base_url = None
 
     def handle_starttag(self, tag, attrs):
+        # <include-fragment src="..." />
         if tag == "include-fragment":
             for attr, val in attrs:
                 if attr == "src":
-                    self.links.append(val)
+                    self.fragments.append(val)
+
+        # <base href="..." />
+        elif tag == "base":
+            for attr, val in attrs:
+                if attr == "href":
+                    self.base_url = val
 
 
 def read_from_url(url, accept_content_type=None):
@@ -625,12 +634,15 @@ def _spider(url: urllib.parse.ParseResult, collect_nested: bool, _visited: Set[s
 
         # Parse out the include-fragments in the page
         # https://github.github.io/include-fragment-element
-        include_fragment_parser = IncludeFragmentParser()
-        include_fragment_parser.feed(page)
+        metadata_parser = ExtractMetadataParser()
+        metadata_parser.feed(page)
+
+        # Change of base URL due to <base href="..." /> tag
+        response_url = metadata_parser.base_url or response_url
 
         fragments = set()
-        while include_fragment_parser.links:
-            raw_link = include_fragment_parser.links.pop()
+        while metadata_parser.fragments:
+            raw_link = metadata_parser.fragments.pop()
             abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True)
 
             try:
-- 
cgit v1.2.3-60-g2f50