author     Todd Gamblin <tgamblin@llnl.gov>   2014-11-07 00:17:25 -0800
committer  Todd Gamblin <tgamblin@llnl.gov>   2014-11-07 00:17:25 -0800
commit     55bf243f166e38451026813fa2bdc2c90263d1aa
tree       70b7390b1423ea8dc2f8ff75753b329e1e227042
parent     d78ece658b0b139604998886a788acc11e661b14
Improved website scraping.
-rwxr-xr-x  bin/spack                          1
-rw-r--r--  lib/spack/spack/cmd/checksum.py   34
-rw-r--r--  lib/spack/spack/cmd/create.py      7
-rw-r--r--  lib/spack/spack/cmd/versions.py   20
-rw-r--r--  lib/spack/spack/concretize.py      2
-rw-r--r--  lib/spack/spack/package.py       119
-rw-r--r--  lib/spack/spack/url.py             6
-rw-r--r--  lib/spack/spack/util/web.py       40
8 files changed, 134 insertions, 95 deletions
diff --git a/bin/spack b/bin/spack
index 75874ca39e..9fbb65f349 100755
--- a/bin/spack
+++ b/bin/spack
@@ -113,4 +113,5 @@ except SpackError, e:
tty.die(e.message)
except KeyboardInterrupt:
+ sys.stderr.write('\n')
tty.die("Keyboard interrupt.")
diff --git a/lib/spack/spack/cmd/checksum.py b/lib/spack/spack/cmd/checksum.py
index f9218b9df1..3f2a9aa745 100644
--- a/lib/spack/spack/cmd/checksum.py
+++ b/lib/spack/spack/cmd/checksum.py
@@ -85,24 +85,24 @@ def checksum(parser, args):
pkg = spack.db.get(args.package)
# If the user asked for specific versions, use those.
- versions = [ver(v) for v in args.versions]
-
- if not all(type(v) == Version for v in versions):
- tty.die("Cannot generate checksums for version lists or " +
- "version ranges. Use unambiguous versions.")
-
- if not versions:
- versions = pkg.fetch_available_versions()
+ if args.versions:
+ versions = {}
+ for v in args.versions:
+ v = ver(v)
+ if not isinstance(v, Version):
+ tty.die("Cannot generate checksums for version lists or " +
+ "version ranges. Use unambiguous versions.")
+ versions[v] = pkg.url_for_version(v)
+ else:
+ versions = pkg.fetch_remote_versions()
if not versions:
- tty.die("Could not fetch any available versions for %s." % pkg.name)
-
- versions = list(reversed(sorted(versions)))
- urls = [pkg.url_for_version(v) for v in versions]
+ tty.die("Could not fetch any versions for %s." % pkg.name)
+ sorted_versions = list(reversed(sorted(versions)))
- tty.msg("Found %s versions of %s." % (len(urls), pkg.name),
+ tty.msg("Found %s versions of %s." % (len(versions), pkg.name),
*spack.cmd.elide_list(
- ["%-10s%s" % (v,u) for v, u in zip(versions, urls)]))
+ ["%-10s%s" % (v, versions[v]) for v in sorted_versions]))
print
archives_to_fetch = tty.get_number(
"How many would you like to checksum?", default=5, abort='q')
@@ -112,10 +112,12 @@ def checksum(parser, args):
return
version_hashes = get_checksums(
- versions[:archives_to_fetch], urls[:archives_to_fetch], keep_stage=args.keep_stage)
+ sorted_versions[:archives_to_fetch],
+ [versions[v] for v in sorted_versions[:archives_to_fetch]],
+ keep_stage=args.keep_stage)
if not version_hashes:
- tty.die("Could not fetch any available versions for %s." % pkg.name)
+ tty.die("Could not fetch any versions for %s." % pkg.name)
version_lines = [" version('%s', '%s')" % (v, h) for v, h in version_hashes]
tty.msg("Checksummed new versions of %s:" % pkg.name, *version_lines)
diff --git a/lib/spack/spack/cmd/create.py b/lib/spack/spack/cmd/create.py
index 7ac10285a4..1b9ad524c4 100644
--- a/lib/spack/spack/cmd/create.py
+++ b/lib/spack/spack/cmd/create.py
@@ -159,13 +159,12 @@ def create(parser, args):
else:
mkdirp(os.path.dirname(pkg_path))
- versions = list(reversed(spack.package.find_versions_of_archive(url)))
+ versions = spack.package.find_versions_of_archive(url)
archives_to_fetch = 1
if not versions:
# If the fetch failed for some reason, revert to what the user provided
- versions = [version]
- urls = [url]
+ versions = { version : url }
else:
urls = [spack.url.substitute_version(url, v) for v in versions]
if len(urls) > 1:
@@ -181,6 +180,8 @@ def create(parser, args):
tty.msg("Aborted.")
return
+ sorted_versions = list(reversed(versions))
+
guesser = ConfigureGuesser()
ver_hash_tuples = spack.cmd.checksum.get_checksums(
versions[:archives_to_fetch], urls[:archives_to_fetch],
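A rough sketch of the `create` fallback introduced above (hypothetical values): when the scrape finds nothing, a one-entry dictionary is built from the URL and version the user supplied, so the rest of the command can treat both cases the same way.

    url = 'http://example.com/pkg-1.4.tar.gz'
    version = '1.4'
    scraped = {}                        # pretend find_versions_of_archive() found nothing
    if not scraped:
        versions = {version: url}       # fall back to what the user provided
    else:
        versions = scraped
    sorted_versions = list(reversed(sorted(versions)))
    print(sorted_versions)
    print([versions[v] for v in sorted_versions])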
diff --git a/lib/spack/spack/cmd/versions.py b/lib/spack/spack/cmd/versions.py
index c545035279..ed16728261 100644
--- a/lib/spack/spack/cmd/versions.py
+++ b/lib/spack/spack/cmd/versions.py
@@ -24,6 +24,7 @@
##############################################################################
import os
from llnl.util.tty.colify import colify
+import llnl.util.tty as tty
import spack
description ="List available versions of a package"
@@ -34,4 +35,21 @@ def setup_parser(subparser):
def versions(parser, args):
pkg = spack.db.get(args.package)
- colify(reversed(pkg.fetch_available_versions()))
+
+ safe_versions = pkg.versions
+ fetched_versions = pkg.fetch_remote_versions()
+ remote_versions = set(fetched_versions).difference(safe_versions)
+
+ tty.msg("Safe versions (already checksummed):")
+ colify(sorted(safe_versions, reverse=True), indent=2)
+
+ tty.msg("Remote versions (not yet checksummed):")
+ if not remote_versions:
+ if not fetched_versions:
+ print " Found no versions for %s" % pkg.name
+ tty.debug("Check the list_url and list_depth attribute on the "
+ "package to help Spack find versions.")
+ else:
+ print " Found no unckecksummed versions for %s" % pkg.name
+ else:
+ colify(sorted(remote_versions, reverse=True), indent=2)
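A minimal sketch of the split made above in versions.py (values are hypothetical): checksummed versions come straight from the package file, while everything else found on the listing page is reported separately.

    safe_versions = {'1.0', '1.1'}                    # already checksummed in the package file
    fetched_versions = {'1.0', '1.1', '1.2', '2.0'}   # scraped via fetch_remote_versions()
    remote_versions = set(fetched_versions).difference(safe_versions)
    print(sorted(safe_versions, reverse=True))        # ['1.1', '1.0']
    print(sorted(remote_versions, reverse=True))      # ['2.0', '1.2']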
diff --git a/lib/spack/spack/concretize.py b/lib/spack/spack/concretize.py
index eee8cb7fde..805604368e 100644
--- a/lib/spack/spack/concretize.py
+++ b/lib/spack/spack/concretize.py
@@ -68,7 +68,7 @@ class DefaultConcretizer(object):
# If there are known available versions, return the most recent
# version that satisfies the spec
pkg = spec.package
- valid_versions = [v for v in pkg.available_versions
+ valid_versions = [v for v in pkg.versions
if any(v.satisfies(sv) for sv in spec.versions)]
if valid_versions:
diff --git a/lib/spack/spack/package.py b/lib/spack/spack/package.py
index 7cf94ed1ef..58d62a7deb 100644
--- a/lib/spack/spack/package.py
+++ b/lib/spack/spack/package.py
@@ -39,7 +39,7 @@ import inspect
import subprocess
import platform as py_platform
import multiprocessing
-from urlparse import urlparse
+from urlparse import urlparse, urljoin
import llnl.util.tty as tty
from llnl.util.filesystem import *
@@ -333,9 +333,6 @@ class Package(object):
if '.' in self.name:
self.name = self.name[self.name.rindex('.') + 1:]
- # This is set by scraping a web page.
- self._available_versions = None
-
# Sanity check some required variables that could be
# overridden by package authors.
def ensure_has_dict(attr_name):
@@ -370,14 +367,15 @@ class Package(object):
# Init fetch strategy and url to None
self._fetcher = None
- self.url = None
+ self.url = getattr(self.__class__, 'url', None)
# Fix up self.url if this package fetches with a URLFetchStrategy.
# This makes self.url behave sanely.
if self.spec.versions.concrete:
- # TODO: this is a really roundabout way of determining the type of fetch to do.
- # TODO: figure out a more sane fetch strategy/package init order
- # TODO: (right now it's conflated with stage, package, and the tests make assumptions)
+ # TODO: this is a really roundabout way of determining the type
+ # TODO: of fetch to do. figure out a more sane fetch strategy/package
+ # TODO: init order (right now it's conflated with stage, package, and
+ # TODO: the tests make assumptions)
f = fs.for_package_version(self, self.version)
if isinstance(f, fs.URLFetchStrategy):
self.url = self.url_for_version(self.spec.version)
@@ -852,71 +850,70 @@ class Package(object):
self.stage.destroy()
- def fetch_available_versions(self):
- if not hasattr(self, 'url'):
- raise VersionFetchError(self.__class__)
-
- # If not, then try to fetch using list_url
- if not self._available_versions:
- try:
- self._available_versions = find_versions_of_archive(
- self.url,
- list_url=self.list_url,
- list_depth=self.list_depth)
-
- if not self._available_versions:
- tty.warn("Found no versions for %s" % self.name,
- "Check the list_url and list_depth attribute on the "
- + self.name + " package.",
- "Use them to tell Spack where to look for versions.")
+ @property
+ def all_urls(self):
+ urls = []
+ if self.url:
+ urls.append(self.url)
- except spack.error.NoNetworkConnectionError, e:
- tty.die("Package.fetch_available_versions couldn't connect to:",
- e.url, e.message)
+ for args in self.versions.values():
+ if 'url' in args:
+ urls.append(args['url'])
+ return urls
- return self._available_versions
+ def fetch_remote_versions(self):
+ """Try to find remote versions of this package using the
+ list_url and any other URLs described in the package file."""
+ if not self.all_urls:
+ raise VersionFetchError(self.__class__)
- @property
- def available_versions(self):
- # If the package overrode available_versions, then use that.
- if self.versions is not None:
- return VersionList(self.versions.keys())
- else:
- vlist = self.fetch_available_versions()
- if not vlist:
- vlist = ver([self.version])
- return vlist
+ try:
+ return find_versions_of_archive(
+ *self.all_urls, list_url=self.list_url, list_depth=self.list_depth)
+ except spack.error.NoNetworkConnectionError, e:
+ tty.die("Package.fetch_versions couldn't connect to:",
+ e.url, e.message)
-def find_versions_of_archive(archive_url, **kwargs):
+def find_versions_of_archive(*archive_urls, **kwargs):
list_url = kwargs.get('list_url', None)
list_depth = kwargs.get('list_depth', 1)
- if not list_url:
- list_url = url.find_list_url(archive_url)
-
- # This creates a regex from the URL with a capture group for the
- # version part of the URL. The capture group is converted to a
- # generic wildcard, so we can use this to extract things on a page
- # that look like archive URLs.
- url_regex = url.wildcard_version(archive_url)
-
- # We'll be a bit more liberal and just look for the archive part,
- # not the full path.
- archive_regex = os.path.basename(url_regex)
+ # Generate a list of list_urls based on archive urls and any
+ # explicitly listed list_url in the package
+ list_urls = set()
+ if list_url:
+ list_urls.add(list_url)
+ for aurl in archive_urls:
+ list_urls.add(url.find_list_url(aurl))
# Grab some web pages to scrape.
- page_map = get_pages(list_url, depth=list_depth)
+ page_map = {}
+ for lurl in list_urls:
+ page_map.update(get_pages(lurl, depth=list_depth))
+
+ # Scrape them for archive URLs
+ regexes = []
+ for aurl in archive_urls:
+ # This creates a regex from the URL with a capture group for
+ # the version part of the URL. The capture group is converted
+ # to a generic wildcard, so we can use this to extract things
+ # on a page that look like archive URLs.
+ url_regex = url.wildcard_version(aurl)
+
+ # We'll be a bit more liberal and just look for the archive
+ # part, not the full path.
+ regexes.append(os.path.basename(url_regex))
# Build a version list from all the matches we find
- versions = VersionList()
- for site, page in page_map.iteritems():
+ versions = {}
+ for page_url, content in page_map.iteritems():
# extract versions from matches.
- matches = re.finditer(archive_regex, page)
- version_strings = set(m.group(1) for m in matches)
- for v in version_strings:
- versions.add(Version(v))
+ for regex in regexes:
+ versions.update(
+ (Version(m.group(1)), urljoin(page_url, m.group(0)))
+ for m in re.finditer(regex, content))
return versions
@@ -979,8 +976,8 @@ class VersionFetchError(PackageError):
"""Raised when a version URL cannot automatically be determined."""
def __init__(self, cls):
super(VersionFetchError, self).__init__(
- "Cannot fetch version for package %s " % cls.__name__ +
- "because it does not define a default url.")
+ "Cannot fetch versions for package %s " % cls.__name__ +
+ "because it does not define any URLs to fetch.")
class NoURLError(PackageError):
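The heart of the package.py change is that find_versions_of_archive now returns a dictionary mapping each scraped version to a full archive URL. A self-contained approximation of that scraping step, with a made-up page and a hand-written stand-in for url.wildcard_version:

    import re
    try:
        from urllib.parse import urljoin    # Python 3
    except ImportError:
        from urlparse import urljoin        # Python 2, as used in package.py

    # Hypothetical page content and regex, standing in for the scraped list_url
    # page and for url.wildcard_version(archive_url).
    page_url = 'http://example.com/downloads/'
    content = '<a href="pkg-1.2.0.tar.gz">pkg-1.2.0.tar.gz</a> <a href="pkg-1.3.1.tar.gz">pkg-1.3.1.tar.gz</a>'
    archive_regex = r'pkg-(\d+(?:\.\d+)*)\.tar\.gz'

    versions = {}
    for m in re.finditer(archive_regex, content):
        # each match contributes a version -> absolute-URL entry
        versions[m.group(1)] = urljoin(page_url, m.group(0))

    print(versions)   # {'1.2.0': '.../pkg-1.2.0.tar.gz', '1.3.1': '.../pkg-1.3.1.tar.gz'}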
diff --git a/lib/spack/spack/url.py b/lib/spack/spack/url.py
index e2fbb19f5d..a0410131b0 100644
--- a/lib/spack/spack/url.py
+++ b/lib/spack/spack/url.py
@@ -245,6 +245,10 @@ def wildcard_version(path):
# Construct a case-insensitive regular expression for the package name.
name_re = '(%s)' % insensitize(name)
+ # protect extensions like bz2 from wildcarding.
+ ext = comp.extension(path)
+ path = comp.strip_extension(path)
+
# Split the string apart by things that match the name so that if the
# name contains numbers or things that look like versions, we don't
# catch them with the version wildcard.
@@ -261,4 +265,4 @@ def wildcard_version(path):
name_parts[i] = vgroup.join(re.escape(vp) for vp in vparts)
# Put it all back together with original name matches intact.
- return ''.join(name_parts)
+ return ''.join(name_parts) + '.' + ext
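The url.py fix protects archive extensions from the version wildcard. A simplified sketch of the idea (the helper results and regexes here are stand-ins for comp.extension and comp.strip_extension, not Spack's real ones): without stripping the extension first, the digit in "bz2" could be swallowed by the wildcard.

    import re

    path = 'foo-1.2.3.tar.bz2'
    ext = 'tar.bz2'                           # comp.extension(path) in Spack
    stem = path[:-(len(ext) + 1)]             # comp.strip_extension(path) -> 'foo-1.2.3'
    version_wildcard = r'(\d+(\.\d+)*)'
    wildcarded = re.sub(r'\d+(\.\d+)*', lambda m: version_wildcard, stem)
    wildcarded = wildcarded + '.' + ext       # re-append the extension verbatim, as in the patch
    print(wildcarded)                         # foo-(\d+(\.\d+)*).tar.bz2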
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index ba42cb37b5..1420d62a77 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -25,7 +25,7 @@
import re
import sys
import subprocess
-import urllib2
+import urllib2, cookielib
import urlparse
from multiprocessing import Pool
from HTMLParser import HTMLParser, HTMLParseError
@@ -68,7 +68,7 @@ def _spider(args):
pool. Firing off all the child links at once makes the fetch MUCH
faster for pages with lots of children.
"""
- url, depth, max_depth, raise_on_error = args
+ url, visited, root, opener, depth, max_depth, raise_on_error = args
pages = {}
try:
@@ -82,12 +82,12 @@ def _spider(args):
resp = urllib2.urlopen(req, timeout=TIMEOUT)
if not "Content-type" in resp.headers:
- tty.warn("ignoring page " + url)
+ tty.debug("ignoring page " + url)
return pages
if not resp.headers["Content-type"].startswith('text/html'):
- tty.warn("ignoring page " + url + " with content type " +
- resp.headers["Content-type"])
+ tty.debug("ignoring page " + url + " with content type " +
+ resp.headers["Content-type"])
return pages
# Do the real GET request when we know it's just HTML.
@@ -114,15 +114,30 @@ def _spider(args):
# Evaluate the link relative to the page it came from.
abs_link = urlparse.urljoin(response_url, raw_link)
- subcalls.append((abs_link, depth+1, max_depth, raise_on_error))
+
+ # Skip things outside the root directory
+ if not abs_link.startswith(root):
+ continue
+
+ # Skip already-visited links
+ if abs_link in visited:
+ continue
+
+ subcalls.append((abs_link, visited, root, None, depth+1, max_depth, raise_on_error))
+ visited.add(abs_link)
if subcalls:
- pool = Pool(processes=len(subcalls))
- dicts = pool.map(_spider, subcalls)
- for d in dicts:
- pages.update(d)
+ try:
+ pool = Pool(processes=len(subcalls))
+ dicts = pool.map(_spider, subcalls)
+ for d in dicts:
+ pages.update(d)
+ finally:
+ pool.terminate()
+ pool.join()
except urllib2.URLError, e:
+ tty.debug(e)
if raise_on_error:
raise spack.error.NoNetworkConnectionError(str(e), url)
@@ -137,7 +152,8 @@ def _spider(args):
tty.warn(msg, url, "HTMLParseError: " + str(e))
except Exception, e:
- pass # Other types of errors are completely ignored.
+ # Other types of errors are completely ignored, except in debug mode.
+ tty.debug("Error in _spider: %s" % e)
return pages
@@ -151,5 +167,5 @@ def get_pages(root_url, **kwargs):
performance over a sequential fetch.
"""
max_depth = kwargs.setdefault('depth', 1)
- pages = _spider((root_url, 1, max_depth, False))
+ pages = _spider((root_url, set(), root_url, None, 1, max_depth, False))
return pages
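Finally, a small sketch of the new link filtering that _spider applies before recursing (URLs are hypothetical): a link is only followed if it stays under the root listing URL and has not been seen before, which keeps the parallel crawl from wandering off-site or looping.

    root = 'http://example.com/downloads/'
    visited = set([root])
    candidate_links = [
        'http://example.com/downloads/old/',   # under root and new -> follow
        'http://example.com/downloads/',       # already visited     -> skip
        'http://other-site.org/mirror/',       # outside root        -> skip
    ]

    to_follow = []
    for link in candidate_links:
        if not link.startswith(root):
            continue
        if link in visited:
            continue
        visited.add(link)
        to_follow.append(link)

    print(to_follow)   # ['http://example.com/downloads/old/']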