author    | Todd Gamblin <tgamblin@llnl.gov> | 2014-11-07 00:17:25 -0800
---|---|---
committer | Todd Gamblin <tgamblin@llnl.gov> | 2014-11-07 00:17:25 -0800
commit    | 55bf243f166e38451026813fa2bdc2c90263d1aa
tree      | 70b7390b1423ea8dc2f8ff75753b329e1e227042
parent    | d78ece658b0b139604998886a788acc11e661b14
Improved website scraping.
-rwxr-xr-x | bin/spack                       |   1
-rw-r--r-- | lib/spack/spack/cmd/checksum.py |  34
-rw-r--r-- | lib/spack/spack/cmd/create.py   |   7
-rw-r--r-- | lib/spack/spack/cmd/versions.py |  20
-rw-r--r-- | lib/spack/spack/concretize.py   |   2
-rw-r--r-- | lib/spack/spack/package.py      | 119
-rw-r--r-- | lib/spack/spack/url.py          |   6
-rw-r--r-- | lib/spack/spack/util/web.py     |  40
8 files changed, 134 insertions, 95 deletions
```diff
diff --git a/bin/spack b/bin/spack
@@ -113,4 +113,5 @@ except SpackError, e:
     tty.die(e.message)
 
 except KeyboardInterrupt:
+    sys.stderr.write('\n')
     tty.die("Keyboard interrupt.")
diff --git a/lib/spack/spack/cmd/checksum.py b/lib/spack/spack/cmd/checksum.py
index f9218b9df1..3f2a9aa745 100644
--- a/lib/spack/spack/cmd/checksum.py
+++ b/lib/spack/spack/cmd/checksum.py
@@ -85,24 +85,24 @@ def checksum(parser, args):
     pkg = spack.db.get(args.package)
 
     # If the user asked for specific versions, use those.
-    versions = [ver(v) for v in args.versions]
-
-    if not all(type(v) == Version for v in versions):
-        tty.die("Cannot generate checksums for version lists or " +
-                "version ranges. Use unambiguous versions.")
-
-    if not versions:
-        versions = pkg.fetch_available_versions()
+    if args.versions:
+        versions = {}
+        for v in args.versions:
+            v = ver(v)
+            if not isinstance(v, Version):
+                tty.die("Cannot generate checksums for version lists or " +
+                        "version ranges. Use unambiguous versions.")
+            versions[v] = pkg.url_for_version(v)
+    else:
+        versions = pkg.fetch_remote_versions()
         if not versions:
-            tty.die("Could not fetch any available versions for %s." % pkg.name)
-
-    versions = list(reversed(sorted(versions)))
-    urls = [pkg.url_for_version(v) for v in versions]
+            tty.die("Could not fetch any versions for %s." % pkg.name)
 
+    sorted_versions = list(reversed(sorted(versions)))
 
-    tty.msg("Found %s versions of %s." % (len(urls), pkg.name),
+    tty.msg("Found %s versions of %s." % (len(versions), pkg.name),
             *spack.cmd.elide_list(
-                ["%-10s%s" % (v,u) for v, u in zip(versions, urls)]))
+                ["%-10s%s" % (v, versions[v]) for v in sorted_versions]))
     print
     archives_to_fetch = tty.get_number(
         "How many would you like to checksum?", default=5, abort='q')
@@ -112,10 +112,12 @@ def checksum(parser, args):
         return
 
     version_hashes = get_checksums(
-        versions[:archives_to_fetch], urls[:archives_to_fetch], keep_stage=args.keep_stage)
+        sorted_versions[:archives_to_fetch],
+        [versions[v] for v in sorted_versions[:archives_to_fetch]],
+        keep_stage=args.keep_stage)
 
     if not version_hashes:
-        tty.die("Could not fetch any available versions for %s." % pkg.name)
+        tty.die("Could not fetch any versions for %s." % pkg.name)
 
     version_lines = ["    version('%s', '%s')" % (v, h) for v, h in version_hashes]
     tty.msg("Checksummed new versions of %s:" % pkg.name, *version_lines)
diff --git a/lib/spack/spack/cmd/create.py b/lib/spack/spack/cmd/create.py
index 7ac10285a4..1b9ad524c4 100644
--- a/lib/spack/spack/cmd/create.py
+++ b/lib/spack/spack/cmd/create.py
@@ -159,13 +159,12 @@ def create(parser, args):
     else:
         mkdirp(os.path.dirname(pkg_path))
 
-    versions = list(reversed(spack.package.find_versions_of_archive(url)))
+    versions = spack.package.find_versions_of_archive(url)
 
     archives_to_fetch = 1
     if not versions:
         # If the fetch failed for some reason, revert to what the user provided
-        versions = [version]
-        urls = [url]
+        versions = { version : url }
     else:
         urls = [spack.url.substitute_version(url, v) for v in versions]
         if len(urls) > 1:
@@ -181,6 +180,8 @@ def create(parser, args):
                 tty.msg("Aborted.")
                 return
 
+    sorted_versions = list(reversed(versions))
+
     guesser = ConfigureGuesser()
     ver_hash_tuples = spack.cmd.checksum.get_checksums(
         versions[:archives_to_fetch], urls[:archives_to_fetch],
diff --git a/lib/spack/spack/cmd/versions.py b/lib/spack/spack/cmd/versions.py
index c545035279..ed16728261 100644
--- a/lib/spack/spack/cmd/versions.py
+++ b/lib/spack/spack/cmd/versions.py
@@ -24,6 +24,7 @@
 ##############################################################################
 import os
 from llnl.util.tty.colify import colify
+import llnl.util.tty as tty
 import spack
 
 description ="List available versions of a package"
@@ -34,4 +35,21 @@ def setup_parser(subparser):
 
 def versions(parser, args):
     pkg = spack.db.get(args.package)
-    colify(reversed(pkg.fetch_available_versions()))
+
+    safe_versions = pkg.versions
+    fetched_versions = pkg.fetch_remote_versions()
+    remote_versions = set(fetched_versions).difference(safe_versions)
+
+    tty.msg("Safe versions (already checksummed):")
+    colify(sorted(safe_versions, reverse=True), indent=2)
+
+    tty.msg("Remote versions (not yet checksummed):")
+    if not remote_versions:
+        if not fetched_versions:
+            print " Found no versions for %s" % pkg.name
+            tty.debug("Check the list_url and list_depth attribute on the "
+                      "package to help Spack find versions.")
+        else:
+            print " Found no unckecksummed versions for %s" % pkg.name
+    else:
+        colify(sorted(remote_versions, reverse=True), indent=2)
diff --git a/lib/spack/spack/concretize.py b/lib/spack/spack/concretize.py
index eee8cb7fde..805604368e 100644
--- a/lib/spack/spack/concretize.py
+++ b/lib/spack/spack/concretize.py
@@ -68,7 +68,7 @@ class DefaultConcretizer(object):
         # If there are known avaialble versions, return the most recent
         # version that satisfies the spec
         pkg = spec.package
-        valid_versions = [v for v in pkg.available_versions
+        valid_versions = [v for v in pkg.versions
                           if any(v.satisfies(sv) for sv in spec.versions)]
 
         if valid_versions:
diff --git a/lib/spack/spack/package.py b/lib/spack/spack/package.py
index 7cf94ed1ef..58d62a7deb 100644
--- a/lib/spack/spack/package.py
+++ b/lib/spack/spack/package.py
@@ -39,7 +39,7 @@ import inspect
 import subprocess
 import platform as py_platform
 import multiprocessing
-from urlparse import urlparse
+from urlparse import urlparse, urljoin
 
 import llnl.util.tty as tty
 from llnl.util.filesystem import *
@@ -333,9 +333,6 @@ class Package(object):
         if '.' in self.name:
             self.name = self.name[self.name.rindex('.') + 1:]
 
-        # This is set by scraping a web page.
-        self._available_versions = None
-
         # Sanity check some required variables that could be
         # overridden by package authors.
         def ensure_has_dict(attr_name):
@@ -370,14 +367,15 @@ class Package(object):
 
         # Init fetch strategy and url to None
         self._fetcher = None
-        self.url = None
+        self.url = getattr(self.__class__, 'url', None)
 
         # Fix up self.url if this package fetches with a URLFetchStrategy.
         # This makes self.url behave sanely.
         if self.spec.versions.concrete:
-            # TODO: this is a really roundabout way of determining the type of fetch to do.
-            # TODO: figure out a more sane fetch strategy/package init order
-            # TODO: (right now it's conflated with stage, package, and the tests make assumptions)
+            # TODO: this is a really roundabout way of determining the type
+            # TODO: of fetch to do. figure out a more sane fetch strategy/package
+            # TODO: init order (right now it's conflated with stage, package, and
+            # TODO: the tests make assumptions)
             f = fs.for_package_version(self, self.version)
             if isinstance(f, fs.URLFetchStrategy):
                 self.url = self.url_for_version(self.spec.version)
@@ -852,71 +850,70 @@ class Package(object):
             self.stage.destroy()
 
 
-    def fetch_available_versions(self):
-        if not hasattr(self, 'url'):
-            raise VersionFetchError(self.__class__)
-
-        # If not, then try to fetch using list_url
-        if not self._available_versions:
-            try:
-                self._available_versions = find_versions_of_archive(
-                    self.url,
-                    list_url=self.list_url,
-                    list_depth=self.list_depth)
-
-                if not self._available_versions:
-                    tty.warn("Found no versions for %s" % self.name,
-                             "Check the list_url and list_depth attribute on the "
-                             + self.name + " package.",
-                             "Use them to tell Spack where to look for versions.")
+    @property
+    def all_urls(self):
+        urls = []
+        if self.url:
+            urls.append(self.url)
 
-            except spack.error.NoNetworkConnectionError, e:
-                tty.die("Package.fetch_available_versions couldn't connect to:",
-                        e.url, e.message)
+        for args in self.versions.values():
+            if 'url' in args:
+                urls.append(args['url'])
+        return urls
 
-        return self._available_versions
+    def fetch_remote_versions(self):
+        """Try to find remote versions of this package using the
+           list_url and any other URLs described in the package file."""
+        if not self.all_urls:
+            raise VersionFetchError(self.__class__)
 
-    @property
-    def available_versions(self):
-        # If the package overrode available_versions, then use that.
-        if self.versions is not None:
-            return VersionList(self.versions.keys())
-        else:
-            vlist = self.fetch_available_versions()
-            if not vlist:
-                vlist = ver([self.version])
-            return vlist
+        try:
+            return find_versions_of_archive(
+                *self.all_urls, list_url=self.list_url, list_depth=self.list_depth)
+        except spack.error.NoNetworkConnectionError, e:
+            tty.die("Package.fetch_versions couldn't connect to:",
+                    e.url, e.message)
 
 
-def find_versions_of_archive(archive_url, **kwargs):
+def find_versions_of_archive(*archive_urls, **kwargs):
     list_url = kwargs.get('list_url', None)
     list_depth = kwargs.get('list_depth', 1)
 
-    if not list_url:
-        list_url = url.find_list_url(archive_url)
-
-    # This creates a regex from the URL with a capture group for the
-    # version part of the URL. The capture group is converted to a
-    # generic wildcard, so we can use this to extract things on a page
-    # that look like archive URLs.
-    url_regex = url.wildcard_version(archive_url)
-
-    # We'll be a bit more liberal and just look for the archive part,
-    # not the full path.
-    archive_regex = os.path.basename(url_regex)
+    # Generate a list of list_urls based on archive urls and any
+    # explicitly listed list_url in the package
+    list_urls = set()
+    if list_url:
+        list_urls.add(list_url)
+    for aurl in archive_urls:
+        list_urls.add(url.find_list_url(aurl))
 
     # Grab some web pages to scrape.
-    page_map = get_pages(list_url, depth=list_depth)
+    page_map = {}
+    for lurl in list_urls:
+        page_map.update(get_pages(lurl, depth=list_depth))
+
+    # Scrape them for archive URLs
+    regexes = []
+    for aurl in archive_urls:
+        # This creates a regex from the URL with a capture group for
+        # the version part of the URL. The capture group is converted
+        # to a generic wildcard, so we can use this to extract things
+        # on a page that look like archive URLs.
+        url_regex = url.wildcard_version(aurl)
+
+        # We'll be a bit more liberal and just look for the archive
+        # part, not the full path.
+        regexes.append(os.path.basename(url_regex))
 
     # Build a version list from all the matches we find
-    versions = VersionList()
-    for site, page in page_map.iteritems():
+    versions = {}
+    for page_url, content in page_map.iteritems():
         # extract versions from matches.
-        matches = re.finditer(archive_regex, page)
-        version_strings = set(m.group(1) for m in matches)
-        for v in version_strings:
-            versions.add(Version(v))
+        for regex in regexes:
+            versions.update(
+                (Version(m.group(1)), urljoin(page_url, m.group(0)))
+                for m in re.finditer(regex, content))
 
     return versions
@@ -979,8 +976,8 @@ class VersionFetchError(PackageError):
     """Raised when a version URL cannot automatically be determined."""
     def __init__(self, cls):
        super(VersionFetchError, self).__init__(
-            "Cannot fetch version for package %s " % cls.__name__ +
-            "because it does not define a default url.")
+            "Cannot fetch versions for package %s " % cls.__name__ +
+            "because it does not define any URLs to fetch.")
 
 
 class NoURLError(PackageError):
diff --git a/lib/spack/spack/url.py b/lib/spack/spack/url.py
index e2fbb19f5d..a0410131b0 100644
--- a/lib/spack/spack/url.py
+++ b/lib/spack/spack/url.py
@@ -245,6 +245,10 @@ def wildcard_version(path):
     # Construct a case-insensitive regular expression for the package name.
     name_re = '(%s)' % insensitize(name)
 
+    # protect extensions like bz2 from wildcarding.
+    ext = comp.extension(path)
+    path = comp.strip_extension(path)
+
     # Split the string apart by things that match the name so that if the
     # name contains numbers or things that look like versions, we don't
     # catch them with the version wildcard.
@@ -261,4 +265,4 @@
         name_parts[i] = vgroup.join(re.escape(vp) for vp in vparts)
 
     # Put it all back together with original name matches intact.
-    return ''.join(name_parts)
+    return ''.join(name_parts) + '.' + ext
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index ba42cb37b5..1420d62a77 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -25,7 +25,7 @@
 import re
 import sys
 import subprocess
-import urllib2
+import urllib2, cookielib
 import urlparse
 from multiprocessing import Pool
 from HTMLParser import HTMLParser, HTMLParseError
@@ -68,7 +68,7 @@ def _spider(args):
        pool.  Firing off all the child links at once makes the fetch
        MUCH faster for pages with lots of children.
""" - url, depth, max_depth, raise_on_error = args + url, visited, root, opener, depth, max_depth, raise_on_error = args pages = {} try: @@ -82,12 +82,12 @@ def _spider(args): resp = urllib2.urlopen(req, timeout=TIMEOUT) if not "Content-type" in resp.headers: - tty.warn("ignoring page " + url) + tty.debug("ignoring page " + url) return pages if not resp.headers["Content-type"].startswith('text/html'): - tty.warn("ignoring page " + url + " with content type " + - resp.headers["Content-type"]) + tty.debug("ignoring page " + url + " with content type " + + resp.headers["Content-type"]) return pages # Do the real GET request when we know it's just HTML. @@ -114,15 +114,30 @@ def _spider(args): # Evaluate the link relative to the page it came from. abs_link = urlparse.urljoin(response_url, raw_link) - subcalls.append((abs_link, depth+1, max_depth, raise_on_error)) + + # Skip things outside the root directory + if not abs_link.startswith(root): + continue + + # Skip already-visited links + if abs_link in visited: + continue + + subcalls.append((abs_link, visited, root, None, depth+1, max_depth, raise_on_error)) + visited.add(abs_link) if subcalls: - pool = Pool(processes=len(subcalls)) - dicts = pool.map(_spider, subcalls) - for d in dicts: - pages.update(d) + try: + pool = Pool(processes=len(subcalls)) + dicts = pool.map(_spider, subcalls) + for d in dicts: + pages.update(d) + finally: + pool.terminate() + pool.join() except urllib2.URLError, e: + tty.debug(e) if raise_on_error: raise spack.error.NoNetworkConnectionError(str(e), url) @@ -137,7 +152,8 @@ def _spider(args): tty.warn(msg, url, "HTMLParseError: " + str(e)) except Exception, e: - pass # Other types of errors are completely ignored. + # Other types of errors are completely ignored, except in debug mode. + tty.debug("Error in _spider: %s" % e) return pages @@ -151,5 +167,5 @@ def get_pages(root_url, **kwargs): performance over a sequential fetch. """ max_depth = kwargs.setdefault('depth', 1) - pages = _spider((root_url, 1, max_depth, False)) + pages = _spider((root_url, set(), root_url, None, 1, max_depth, False)) return pages |