diff options
author | Wouter Deconinck <wdconinc@gmail.com> | 2024-08-17 02:02:03 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-08-17 09:02:03 +0200 |
commit | 553cc3b70a4c126e612145e3e48acff321cedecb (patch) | |
tree | 4194269811bc74e14eac698e51747ff86da57920 /lib | |
parent | f0f9a16e4fc4e5e7dcdfaa36a44ec0ab66779532 (diff) | |
download | spack-553cc3b70a4c126e612145e3e48acff321cedecb.tar.gz spack-553cc3b70a4c126e612145e3e48acff321cedecb.tar.bz2 spack-553cc3b70a4c126e612145e3e48acff321cedecb.tar.xz spack-553cc3b70a4c126e612145e3e48acff321cedecb.zip |
util/web.py: parse new GitLab JS dropdown links (#45764)
Co-authored-by: Harmen Stoppels <harmenstoppels@gmail.com>
Diffstat (limited to 'lib')
-rw-r--r-- | lib/spack/spack/test/data/web/index_with_javascript.html | 10 | ||||
-rw-r--r-- | lib/spack/spack/test/web.py | 6 | ||||
-rw-r--r-- | lib/spack/spack/util/web.py | 19 |
3 files changed, 31 insertions, 4 deletions
diff --git a/lib/spack/spack/test/data/web/index_with_javascript.html b/lib/spack/spack/test/data/web/index_with_javascript.html new file mode 100644 index 0000000000..89882ba431 --- /dev/null +++ b/lib/spack/spack/test/data/web/index_with_javascript.html @@ -0,0 +1,10 @@ +<html> + <head> + This is the root page. + </head> + <body> + This is a page with a Vue javascript drop down with links as used in GitLab. + + <div class="js-source-code-dropdown" data-css-class="" data-download-artifacts="[]" data-download-links="[{&quot;text&quot;:&quot;tar.gz&quot;,&quot;path&quot;:&quot;/foo-5.0.0.tar.gz&quot;}]"></div> + </body> +</html> diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py index cf89e2e3a4..1ae76c6e8d 100644 --- a/lib/spack/spack/test/web.py +++ b/lib/spack/spack/test/web.py @@ -37,6 +37,7 @@ page_3 = _create_url("3.html") page_4 = _create_url("4.html") root_with_fragment = _create_url("index_with_fragment.html") +root_with_javascript = _create_url("index_with_javascript.html") @pytest.mark.parametrize( @@ -148,6 +149,11 @@ def test_find_versions_of_archive_with_fragment(): assert Version("5.0.0") in versions +def test_find_versions_of_archive_with_javascript(): + versions = spack.url.find_versions_of_archive(root_tarball, root_with_javascript, list_depth=0) + assert Version("5.0.0") in versions + + def test_get_header(): headers = {"Content-type": "text/plain"} diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py index 6b27c6ae68..9a0f1d6e4b 100644 --- a/lib/spack/spack/util/web.py +++ b/lib/spack/spack/util/web.py @@ -7,6 +7,7 @@ import codecs import concurrent.futures import email.message import errno +import json import os import os.path import re @@ -152,7 +153,8 @@ class HTMLParseError(Exception): class LinkParser(HTMLParser): """This parser just takes an HTML page and strips out the hrefs on the - links. Good enough for a really simple spider.""" + links, as well as some javascript tags used on GitLab servers. 
+ Good enough for a really simple spider.""" def __init__(self): super().__init__() @@ -160,9 +162,18 @@ class LinkParser(HTMLParser): def handle_starttag(self, tag, attrs): if tag == "a": - for attr, val in attrs: - if attr == "href": - self.links.append(val) + self.links.extend(val for key, val in attrs if key == "href") + + # GitLab uses a javascript function to place dropdown links: + # <div class="js-source-code-dropdown" ... + # data-download-links="[{"path":"/graphviz/graphviz/-/archive/12.0.0/graphviz-12.0.0.zip",...},...]"/> + if tag == "div" and ("class", "js-source-code-dropdown") in attrs: + try: + links_str = next(val for key, val in attrs if key == "data-download-links") + links = json.loads(links_str) + self.links.extend(x["path"] for x in links) + except Exception: + pass class ExtractMetadataParser(HTMLParser): |