author     Todd Gamblin <tgamblin@llnl.gov>  2015-12-23 14:06:37 -0800
committer  Todd Gamblin <tgamblin@llnl.gov>  2015-12-23 14:06:37 -0800
commit  28d61f0d7f1569e643fd79fa9c31c21c2e6ec4e4 (patch)
tree    71d7987a2ec35b898ecff945e9840fc51a00ab02
parent  d63cb8b537a9fc12a2a1ee5b22f2b20e19d90dc1 (diff)
parent  5ca5884ad6285fd766a2f704bceb40b1cf63750f (diff)
Merge pull request #276 from LLNL/bugfix/235-smarter-spider
Fix #235: Smarter web spidering -- use parsed links instead of recons…
-rw-r--r--   lib/spack/spack/cmd/create.py      4
-rw-r--r--   lib/spack/spack/cmd/url-parse.py  75
-rw-r--r--   lib/spack/spack/package.py        46
-rw-r--r--   lib/spack/spack/util/web.py      140
4 files changed, 183 insertions, 82 deletions
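
The gist of the change, before the hunks: _spider() now returns the links it parses out of each page alongside the page text, and find_versions_of_archive() (moved from package.py into spack/util/web.py) matches its version-wildcard regexes against those parsed links instead of re-scanning raw page content and reconstructing URLs. A minimal sketch of the new matching step, using hypothetical links and a hand-written regex in place of spack.url.wildcard_version():

import re

# Hypothetical links as the spider would collect them from a listing page.
links = set([
    "http://example.com/downloads/foo-1.0.tar.gz",
    "http://example.com/downloads/foo-1.1.tar.gz",
    "http://example.com/downloads/index.html",
])

# Stand-in for os.path.basename(spack.url.wildcard_version(archive_url)).
regexes = [r"foo-(\d+\.\d+)\.tar\.gz"]

# Build a version -> URL dict from links that look like the archive,
# as the new find_versions_of_archive() does below.
versions = {}
for url in links:
    for r in regexes:
        m = re.search(r, url)
        if m:
            versions[m.group(1)] = url  # spack.url.parse_version(url) in Spack

print(sorted(versions.items()))  # the two foo-*.tar.gz links, keyed by version
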
diff --git a/lib/spack/spack/cmd/create.py b/lib/spack/spack/cmd/create.py
index 5e42860f3e..9ecb709110 100644
--- a/lib/spack/spack/cmd/create.py
+++ b/lib/spack/spack/cmd/create.py
@@ -34,8 +34,8 @@ from llnl.util.filesystem import mkdirp
import spack
import spack.cmd
import spack.cmd.checksum
-import spack.package
import spack.url
+import spack.util.web
from spack.util.naming import *
import spack.util.crypto as crypto
@@ -166,7 +166,7 @@ def create(parser, args):
tty.msg("This looks like a URL for %s version %s." % (name, version))
tty.msg("Creating template for package %s" % name)
- versions = spack.package.find_versions_of_archive(url)
+ versions = spack.util.web.find_versions_of_archive(url)
rkeys = sorted(versions.keys(), reverse=True)
versions = OrderedDict(zip(rkeys, (versions[v] for v in rkeys)))
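
As a side note on the two context lines above: the dict returned by find_versions_of_archive() is keyed by version, and create.py reorders it newest-first. A standalone sketch of that pattern, using plain strings where Spack uses Version objects:

from collections import OrderedDict

# Hypothetical result of find_versions_of_archive(): version -> archive URL.
versions = {'1.0': 'http://example.com/foo-1.0.tar.gz',
            '1.2': 'http://example.com/foo-1.2.tar.gz',
            '1.1': 'http://example.com/foo-1.1.tar.gz'}

# Same idiom as in create.py: sort the keys descending, rebuild in that order.
rkeys = sorted(versions.keys(), reverse=True)
versions = OrderedDict(zip(rkeys, (versions[v] for v in rkeys)))

print(list(versions))  # ['1.2', '1.1', '1.0']
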
diff --git a/lib/spack/spack/cmd/url-parse.py b/lib/spack/spack/cmd/url-parse.py
new file mode 100644
index 0000000000..077c793d2e
--- /dev/null
+++ b/lib/spack/spack/cmd/url-parse.py
@@ -0,0 +1,75 @@
+##############################################################################
+# Copyright (c) 2013, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Written by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the LICENSE file for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License (as published by
+# the Free Software Foundation) version 2.1 dated February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+import sys
+
+import llnl.util.tty as tty
+
+import spack
+import spack.url
+from spack.util.web import find_versions_of_archive
+
+description = "Show parsing of a URL, optionally spider the web for other versions."
+
+def setup_parser(subparser):
+ subparser.add_argument('url', help="url of a package archive")
+ subparser.add_argument(
+ '-s', '--spider', action='store_true', help="Spider the source page for versions.")
+
+
+def print_name_and_version(url):
+ name, ns, nl, ntup, ver, vs, vl, vtup = spack.url.substitution_offsets(url)
+ underlines = [" "] * max(ns+nl, vs+vl)
+ for i in range(ns, ns+nl):
+ underlines[i] = '-'
+ for i in range(vs, vs+vl):
+ underlines[i] = '~'
+
+ print " %s" % url
+ print " %s" % ''.join(underlines)
+
+
+def url_parse(parser, args):
+ url = args.url
+
+ ver, vs, vl = spack.url.parse_version_offset(url)
+ name, ns, nl = spack.url.parse_name_offset(url, ver)
+
+ tty.msg("Parsing URL:")
+ try:
+ print_name_and_version(url)
+ except spack.url.UrlParseError as e:
+ tty.error(str(e))
+
+ print
+ tty.msg("Substituting version 9.9.9b:")
+ newurl = spack.url.substitute_version(url, '9.9.9b')
+ print_name_and_version(newurl)
+
+ if args.spider:
+ print
+ tty.msg("Spidering for versions:")
+ versions = find_versions_of_archive(url)
+ for v in sorted(versions):
+ print "%-20s%s" % (v, versions[v])
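
For a rough idea of what print_name_and_version() produces, here is a self-contained sketch of the underline logic with hand-picked offsets standing in for spack.url.substitution_offsets():

# Hypothetical URL and offsets; Spack computes ns/nl and vs/vl itself.
url = "http://example.com/foo-1.2.3.tar.gz"
ns, nl = url.index("foo"), len("foo")        # name start and length
vs, vl = url.index("1.2.3"), len("1.2.3")    # version start and length

underlines = [" "] * max(ns + nl, vs + vl)
for i in range(ns, ns + nl):
    underlines[i] = '-'       # dashes mark the detected package name
for i in range(vs, vs + vl):
    underlines[i] = '~'       # tildes mark the detected version

print("    %s" % url)
print("    %s" % ''.join(underlines))
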
diff --git a/lib/spack/spack/package.py b/lib/spack/spack/package.py
index 6673e4f392..b44554e418 100644
--- a/lib/spack/spack/package.py
+++ b/lib/spack/spack/package.py
@@ -1164,7 +1164,7 @@ class Package(object):
raise VersionFetchError(self.__class__)
try:
- return find_versions_of_archive(
+ return spack.util.web.find_versions_of_archive(
*self.all_urls, list_url=self.list_url, list_depth=self.list_depth)
except spack.error.NoNetworkConnectionError, e:
tty.die("Package.fetch_versions couldn't connect to:",
@@ -1188,50 +1188,6 @@ class Package(object):
return " ".join("-Wl,-rpath=%s" % p for p in self.rpath)
-def find_versions_of_archive(*archive_urls, **kwargs):
- list_url = kwargs.get('list_url', None)
- list_depth = kwargs.get('list_depth', 1)
-
- # Generate a list of list_urls based on archive urls and any
- # explicitly listed list_url in the package
- list_urls = set()
- if list_url:
- list_urls.add(list_url)
- for aurl in archive_urls:
- list_urls.add(spack.url.find_list_url(aurl))
-
- # Grab some web pages to scrape.
- page_map = {}
- for lurl in list_urls:
- pages = spack.util.web.get_pages(lurl, depth=list_depth)
- page_map.update(pages)
-
- # Scrape them for archive URLs
- regexes = []
- for aurl in archive_urls:
- # This creates a regex from the URL with a capture group for
- # the version part of the URL. The capture group is converted
- # to a generic wildcard, so we can use this to extract things
- # on a page that look like archive URLs.
- url_regex = spack.url.wildcard_version(aurl)
-
- # We'll be a bit more liberal and just look for the archive
- # part, not the full path.
- regexes.append(os.path.basename(url_regex))
-
- # Build a version list from all the matches we find
- versions = {}
- for page_url, content in page_map.iteritems():
- # extract versions from matches.
- for regex in regexes:
- for m in re.finditer(regex, content):
- url = urljoin(page_url, m.group(0))
- ver = spack.url.parse_version(url)
- versions[ver] = url
-
- return versions
-
-
def validate_package_url(url_string):
"""Determine whether spack can handle a particular URL or not."""
url = urlparse(url_string)
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 94384e9c86..e26daef296 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -23,6 +23,7 @@
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##############################################################################
import re
+import os
import sys
import subprocess
import urllib2, cookielib
@@ -70,7 +71,9 @@ def _spider(args):
"""
url, visited, root, opener, depth, max_depth, raise_on_error = args
- pages = {}
+ pages = {} # dict from page URL -> text content.
+ links = set() # set of all links seen on visited pages.
+
try:
# Make a HEAD request first to check the content type. This lets
# us ignore tarballs and gigantic files.
@@ -99,42 +102,45 @@ def _spider(args):
page = response.read()
pages[response_url] = page
- # If we're not at max depth, parse out the links in the page
- if depth < max_depth:
- link_parser = LinkParser()
- subcalls = []
- link_parser.feed(page)
-
- while link_parser.links:
- raw_link = link_parser.links.pop()
+ # Parse out the links in the page
+ link_parser = LinkParser()
+ subcalls = []
+ link_parser.feed(page)
- # Skip stuff that looks like an archive
- if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
- continue
+ while link_parser.links:
+ raw_link = link_parser.links.pop()
+ abs_link = urlparse.urljoin(response_url, raw_link)
- # Evaluate the link relative to the page it came from.
- abs_link = urlparse.urljoin(response_url, raw_link)
+ links.add(abs_link)
- # Skip things outside the root directory
- if not abs_link.startswith(root):
- continue
+ # Skip stuff that looks like an archive
+ if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
+ continue
- # Skip already-visited links
- if abs_link in visited:
- continue
+ # Skip things outside the root directory
+ if not abs_link.startswith(root):
+ continue
- subcalls.append((abs_link, visited, root, None, depth+1, max_depth, raise_on_error))
- visited.add(abs_link)
+ # Skip already-visited links
+ if abs_link in visited:
+ continue
- if subcalls:
- try:
- pool = Pool(processes=len(subcalls))
- dicts = pool.map(_spider, subcalls)
- for d in dicts:
- pages.update(d)
- finally:
- pool.terminate()
- pool.join()
+ # If we're not at max depth, follow links.
+ if depth < max_depth:
+ subcalls.append((abs_link, visited, root, None,
+ depth+1, max_depth, raise_on_error))
+ visited.add(abs_link)
+
+ if subcalls:
+ try:
+ pool = Pool(processes=len(subcalls))
+ results = pool.map(_spider, subcalls)
+ for sub_pages, sub_links in results:
+ pages.update(sub_pages)
+ links.update(sub_links)
+ finally:
+ pool.terminate()
+ pool.join()
except urllib2.URLError, e:
tty.debug(e)
@@ -155,10 +161,10 @@ def _spider(args):
# Other types of errors are completely ignored, except in debug mode.
tty.debug("Error in _spider: %s" % e)
- return pages
+ return pages, links
-def get_pages(root_url, **kwargs):
+def spider(root_url, **kwargs):
"""Gets web pages from a root URL.
If depth is specified (e.g., depth=2), then this will also fetch pages
linked from the root and its children up to depth.
@@ -167,5 +173,69 @@ def get_pages(root_url, **kwargs):
performance over a sequential fetch.
"""
max_depth = kwargs.setdefault('depth', 1)
- pages = _spider((root_url, set(), root_url, None, 1, max_depth, False))
- return pages
+ pages, links = _spider((root_url, set(), root_url, None, 1, max_depth, False))
+ return pages, links
+
+
+def find_versions_of_archive(*archive_urls, **kwargs):
+ """Scrape web pages for new versions of a tarball.
+
+ Arguments:
+ archive_urls:
+ URLs for different versions of a package. Typically these
+ are just the tarballs from the package file itself. By
+ default, this searches the parent directories of archives.
+
+ Keyword Arguments:
+ list_url:
+ URL for a listing of archives. Spack will scrape these
+ pages for download links that look like the archive URL.
+
+ list_depth:
+ Max depth to follow links on list_url pages.
+
+ """
+ list_url = kwargs.get('list_url', None)
+ list_depth = kwargs.get('list_depth', 1)
+
+ # Generate a list of list_urls based on archive urls and any
+ # explicitly listed list_url in the package
+ list_urls = set()
+ if list_url:
+ list_urls.add(list_url)
+ for aurl in archive_urls:
+ list_urls.add(spack.url.find_list_url(aurl))
+
+ # Grab some web pages to scrape.
+ pages = {}
+ links = set()
+ for lurl in list_urls:
+ p, l = spider(lurl, depth=list_depth)
+ pages.update(p)
+ links.update(l)
+
+ # Scrape them for archive URLs
+ regexes = []
+ for aurl in archive_urls:
+ # This creates a regex from the URL with a capture group for
+ # the version part of the URL. The capture group is converted
+ # to a generic wildcard, so we can use this to extract things
+ # on a page that look like archive URLs.
+ url_regex = spack.url.wildcard_version(aurl)
+
+ # We'll be a bit more liberal and just look for the archive
+ # part, not the full path.
+ regexes.append(os.path.basename(url_regex))
+
+ # Build a dict version -> URL from any links that match the wildcards.
+ versions = {}
+ for url in links:
+ if any(re.search(r, url) for r in regexes):
+ try:
+ ver = spack.url.parse_version(url)
+ versions[ver] = url
+ except spack.url.UndetectableVersionError as e:
+ continue
+
+ return versions
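
Finally, a usage sketch of the relocated API, assuming it runs inside a Spack checkout (so spack.util.web is importable) and that the example URLs are reachable; everything under example.com is a placeholder:

import spack.util.web as web

# spider() now returns the fetched pages and every link parsed out of them.
pages, links = web.spider("http://example.com/downloads/", depth=2)

# find_versions_of_archive() matches those links against wildcarded archive URLs.
versions = web.find_versions_of_archive(
    "http://example.com/downloads/foo-1.0.tar.gz", list_depth=2)

for v in sorted(versions):
    print("%-20s%s" % (v, versions[v]))
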