diff options
author | Harmen Stoppels <harmenstoppels@gmail.com> | 2023-10-11 17:49:50 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-11 08:49:50 -0700 |
commit | 65e7ec0509a2191b0a323730e280bba8bd30395b (patch) | |
tree | c6f6d161be7a79b18eb2b0e34032fe82633c8a87 /lib | |
parent | 1ab88866952e8dc73be7c93dcdcc3cb5965fda10 (diff) | |
download | spack-65e7ec0509a2191b0a323730e280bba8bd30395b.tar.gz spack-65e7ec0509a2191b0a323730e280bba8bd30395b.tar.bz2 spack-65e7ec0509a2191b0a323730e280bba8bd30395b.tar.xz spack-65e7ec0509a2191b0a323730e280bba8bd30395b.zip |
spider: respect <base> tag (#40443)
Diffstat (limited to 'lib')
-rw-r--r-- | lib/spack/spack/util/web.py | 28 |
1 files changed, 20 insertions, 8 deletions
```diff
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index eca7bd72a2..57158db950 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -110,19 +110,28 @@ class LinkParser(HTMLParser):
                     self.links.append(val)
 
 
-class IncludeFragmentParser(HTMLParser):
+class ExtractMetadataParser(HTMLParser):
     """This parser takes an HTML page and selects the include-fragments,
-    used on GitHub, https://github.github.io/include-fragment-element."""
+    used on GitHub, https://github.github.io/include-fragment-element,
+    as well as a possible base url."""
 
     def __init__(self):
         super().__init__()
-        self.links = []
+        self.fragments = []
+        self.base_url = None
 
     def handle_starttag(self, tag, attrs):
+        # <include-fragment src="..." />
         if tag == "include-fragment":
             for attr, val in attrs:
                 if attr == "src":
-                    self.links.append(val)
+                    self.fragments.append(val)
+
+        # <base href="..." />
+        elif tag == "base":
+            for attr, val in attrs:
+                if attr == "href":
+                    self.base_url = val
 
 
 def read_from_url(url, accept_content_type=None):
@@ -625,12 +634,15 @@ def _spider(url: urllib.parse.ParseResult, collect_nested: bool, _visited: Set[s
 
         # Parse out the include-fragments in the page
         # https://github.github.io/include-fragment-element
-        include_fragment_parser = IncludeFragmentParser()
-        include_fragment_parser.feed(page)
+        metadata_parser = ExtractMetadataParser()
+        metadata_parser.feed(page)
+
+        # Change of base URL due to <base href="..." /> tag
+        response_url = metadata_parser.base_url or response_url
 
         fragments = set()
-        while include_fragment_parser.links:
-            raw_link = include_fragment_parser.links.pop()
+        while metadata_parser.fragments:
+            raw_link = metadata_parser.fragments.pop()
             abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True)
 
             try:
```