summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author Wouter Deconinck <wdconinc@gmail.com> 2024-08-17 02:02:03 -0500
committer GitHub <noreply@github.com> 2024-08-17 09:02:03 +0200
commit 553cc3b70a4c126e612145e3e48acff321cedecb (patch)
tree 4194269811bc74e14eac698e51747ff86da57920 /lib
parent f0f9a16e4fc4e5e7dcdfaa36a44ec0ab66779532 (diff)
download spack-553cc3b70a4c126e612145e3e48acff321cedecb.tar.gz
spack-553cc3b70a4c126e612145e3e48acff321cedecb.tar.bz2
spack-553cc3b70a4c126e612145e3e48acff321cedecb.tar.xz
spack-553cc3b70a4c126e612145e3e48acff321cedecb.zip
util/web.py: parse new GitLab JS dropdown links (#45764)
Co-authored-by: Harmen Stoppels <harmenstoppels@gmail.com>
Diffstat (limited to 'lib')
-rw-r--r-- lib/spack/spack/test/data/web/index_with_javascript.html 10
-rw-r--r-- lib/spack/spack/test/web.py 6
-rw-r--r-- lib/spack/spack/util/web.py 19
3 files changed, 31 insertions, 4 deletions
diff --git a/lib/spack/spack/test/data/web/index_with_javascript.html b/lib/spack/spack/test/data/web/index_with_javascript.html
new file mode 100644
index 0000000000..89882ba431
--- /dev/null
+++ b/lib/spack/spack/test/data/web/index_with_javascript.html
@@ -0,0 +1,10 @@
+<html>
+ <head>
+ This is the root page.
+ </head>
+ <body>
+ This is a page with a Vue javascript drop down with links as used in GitLab.
+
+ <div class="js-source-code-dropdown" data-css-class="" data-download-artifacts="[]" data-download-links="[{&quot;text&quot;:&quot;tar.gz&quot;,&quot;path&quot;:&quot;/foo-5.0.0.tar.gz&quot;}]"></div>
+ </body>
+</html>
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index cf89e2e3a4..1ae76c6e8d 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -37,6 +37,7 @@ page_3 = _create_url("3.html")
page_4 = _create_url("4.html")
root_with_fragment = _create_url("index_with_fragment.html")
+root_with_javascript = _create_url("index_with_javascript.html")
@pytest.mark.parametrize(
@@ -148,6 +149,11 @@ def test_find_versions_of_archive_with_fragment():
assert Version("5.0.0") in versions
+def test_find_versions_of_archive_with_javascript():
+ versions = spack.url.find_versions_of_archive(root_tarball, root_with_javascript, list_depth=0)
+ assert Version("5.0.0") in versions
+
+
def test_get_header():
headers = {"Content-type": "text/plain"}
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 6b27c6ae68..9a0f1d6e4b 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -7,6 +7,7 @@ import codecs
import concurrent.futures
import email.message
import errno
+import json
import os
import os.path
import re
@@ -152,7 +153,8 @@ class HTMLParseError(Exception):
class LinkParser(HTMLParser):
"""This parser just takes an HTML page and strips out the hrefs on the
- links. Good enough for a really simple spider."""
+ links, as well as some javascript tags used on GitLab servers.
+ Good enough for a really simple spider."""
def __init__(self):
super().__init__()
@@ -160,9 +162,18 @@ class LinkParser(HTMLParser):
def handle_starttag(self, tag, attrs):
if tag == "a":
- for attr, val in attrs:
- if attr == "href":
- self.links.append(val)
+ self.links.extend(val for key, val in attrs if key == "href")
+
+ # GitLab uses a javascript function to place dropdown links:
+ # <div class="js-source-code-dropdown" ...
+ # data-download-links="[{"path":"/graphviz/graphviz/-/archive/12.0.0/graphviz-12.0.0.zip",...},...]"/>
+ if tag == "div" and ("class", "js-source-code-dropdown") in attrs:
+ try:
+ links_str = next(val for key, val in attrs if key == "data-download-links")
+ links = json.loads(links_str)
+ self.links.extend(x["path"] for x in links)
+ except Exception:
+ pass
class ExtractMetadataParser(HTMLParser):