From 3bbca9bd05b0032d170f3e86a594b68157a66472 Mon Sep 17 00:00:00 2001
From: Todd Gamblin <tgamblin@llnl.gov>
Date: Fri, 25 Apr 2014 14:41:37 -0700
Subject: Better version wildcard handling, better spidering

- Allow version wildcards to match [_.-] instead of the exact separators
  the version was constructed with.
  - Handles the fact that boost versions are written both 1.55.0 and 1_55_0.

- Update spidering to handle parse errors and warn that Python < 2.7.3
  has less robust HTML parsing abilities.
---
 .gitignore                  |  2 +-
 lib/spack/spack/error.py    |  4 +++-
 lib/spack/spack/url.py      |  2 +-
 lib/spack/spack/util/web.py | 34 ++++++++++++++++++++++------------
 lib/spack/spack/version.py  | 19 +++++++++++--------
 5 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0e239fa0bb..7010bf7ede 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
+/var/spack/stage
 *.pyc
 /opt/
-/var/
 *~
 .DS_Store
 .idea
diff --git a/lib/spack/spack/error.py b/lib/spack/spack/error.py
index 47fb858f3f..40e0e75fdb 100644
--- a/lib/spack/spack/error.py
+++ b/lib/spack/spack/error.py
@@ -41,5 +41,7 @@ class UnsupportedPlatformError(SpackError):
 class NoNetworkConnectionError(SpackError):
     """Raised when an operation needs an internet connection."""
     def __init__(self, message, url):
-        super(NoNetworkConnectionError, self).__init__(message)
+        super(NoNetworkConnectionError, self).__init__(
+            "No network connection: " + str(message),
+            "URL was: " + str(url))
         self.url = url
diff --git a/lib/spack/spack/url.py b/lib/spack/spack/url.py
index f56aaee493..deac156571 100644
--- a/lib/spack/spack/url.py
+++ b/lib/spack/spack/url.py
@@ -206,7 +206,7 @@ def wildcard_version(path):
     ver, start, end = parse_version_string_with_indices(path)

     v = Version(ver)
-    parts = list(re.escape(p) for p in path.split(str(v)))
+    parts = [re.escape(p) for p in re.split(v.wildcard(), path)]

     # Make a group for the wildcard, so it will be captured by the regex.
     version_group = '(%s)' % v.wildcard()
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index b5104eb076..ba42cb37b5 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -23,11 +23,12 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import sys
 import subprocess
 import urllib2
 import urlparse
 from multiprocessing import Pool
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError

 import llnl.util.tty as tty
@@ -67,7 +68,7 @@ def _spider(args):
        pool.  Firing off all the child links at once makes the fetch MUCH
        faster for pages with lots of children.
     """
-    url, depth, max_depth = args
+    url, depth, max_depth, raise_on_error = args

     pages = {}
     try:
@@ -81,11 +82,12 @@ def _spider(args):
         resp = urllib2.urlopen(req, timeout=TIMEOUT)

         if not "Content-type" in resp.headers:
-            print "ignoring page " + url
+            tty.warn("ignoring page " + url)
             return pages

         if not resp.headers["Content-type"].startswith('text/html'):
-            print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
+            tty.warn("ignoring page " + url + " with content type " +
+                     resp.headers["Content-type"])
             return pages

         # Do the real GET request when we know it's just HTML.
@@ -100,9 +102,9 @@ def _spider(args):
         # If we're not at max depth, parse out the links in the page
         if depth < max_depth:
             link_parser = LinkParser()
-
             subcalls = []
             link_parser.feed(page)
+
             while link_parser.links:
                 raw_link = link_parser.links.pop()

@@ -112,7 +114,7 @@ def _spider(args):

                 # Evaluate the link relative to the page it came from.
                 abs_link = urlparse.urljoin(response_url, raw_link)
-                subcalls.append((abs_link, depth+1, max_depth))
+                subcalls.append((abs_link, depth+1, max_depth, raise_on_error))

             if subcalls:
                 pool = Pool(processes=len(subcalls))
@@ -121,13 +123,21 @@ def _spider(args):
             pages.update(d)

     except urllib2.URLError, e:
-        # Only report it if it's the root page.  We ignore errors when spidering.
-        if depth == 1:
-            raise spack.error.NoNetworkConnectionError(e.reason, url)
+        if raise_on_error:
+            raise spack.error.NoNetworkConnectionError(str(e), url)
+
+    except HTMLParseError, e:
+        # This error indicates that Python's HTML parser sucks.
+        msg = "Got an error parsing HTML."
+
+        # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
+        if sys.version_info[:3] < (2,7,3):
+            msg += " Use Python 2.7.3 or newer for better HTML parsing."
+
+        tty.warn(msg, url, "HTMLParseError: " + str(e))

     except Exception, e:
-        # Other types of errors are completely ignored.
-        pass
+        pass    # Other types of errors are completely ignored.

     return pages

@@ -141,5 +151,5 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages = _spider((root_url, 1, max_depth))
+    pages = _spider((root_url, 1, max_depth, False))
     return pages
diff --git a/lib/spack/spack/version.py b/lib/spack/spack/version.py
index 1f44c5f39b..0b5125fdf0 100644
--- a/lib/spack/spack/version.py
+++ b/lib/spack/spack/version.py
@@ -152,21 +152,24 @@ class Version(object):
             return r'[a-zA-Z]+'

         version = self.version
-        separators = ('',) + self.separators
+
+        # Use a wildcard for separators, in case a version is written
+        # two different ways (e.g., boost writes 1_55_0 and 1.55.0)
+        sep_re = '[_.-]'
+        separators = ('',) + (sep_re,) * len(self.separators)

         version += (version[-1],) * 2
-        separators += (separators[-1],) * 2
+        separators += (sep_re,) * 2

-        sep_res = [re.escape(sep) for sep in separators]
-        seg_res = [a_or_n(seg) for seg in version]
+        segments = [a_or_n(seg) for seg in version]

-        wc = seg_res[0]
-        for i in xrange(1, len(sep_res)):
-            wc += '(?:' + sep_res[i] + seg_res[i]
+        wc = segments[0]
+        for i in xrange(1, len(separators)):
+            wc += '(?:' + separators[i] + segments[i]

         # Add possible alpha or beta indicator at the end of each segment
         # We treat these specially b/c they're so common.
-        wc += '[ab]?)?' * (len(seg_res) - 1)
+        wc += '[ab]?)?' * (len(segments) - 1)
         return wc
--
cgit v1.2.3-60-g2f50
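
A note on the separator wildcard above: Version.wildcard() now emits the
character class [_.-] at every separator position, so one pattern matches a
version however the project spells it. This is also why url.py switches from
path.split(str(v)) to re.split(v.wildcard(), path): splitting on the literal
version string would miss a differently-separated occurrence in the URL. A
minimal, self-contained sketch of the idea follows; simple_wildcard and
SEP_RE are illustrative names, not Spack's, and the real wildcard() is more
elaborate (alphabetic segments, trailing [ab]? markers):

    import re

    # Sketch only: each separator position matches '.', '_', or '-',
    # rather than the literal separator the version was parsed with.
    SEP_RE = '[_.-]'

    def simple_wildcard(version_string):
        """Build a regex matching this version under any separator spelling."""
        segments = re.split(SEP_RE, version_string)
        return SEP_RE.join(re.escape(s) for s in segments)

    pattern = simple_wildcard('1.55.0')      # -> '1[_.-]55[_.-]0'
    for spelling in ('1.55.0', '1_55_0', '1-55-0'):
        assert re.match(pattern + '$', spelling)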
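
The spidering change has a similar defensive shape: HTML parse failures are
downgraded to warnings instead of aborting the crawl, with an upgrade hint
only on interpreters that predate the parser fixes in 2.7.3. A rough Python 2
sketch of that error-handling shape, where feed_forgivingly is a hypothetical
helper and warn stands in for llnl.util.tty.warn (this is not the full
_spider(), which also extracts links and fans out over a process pool):

    import sys
    from HTMLParser import HTMLParser, HTMLParseError

    def feed_forgivingly(page, url, warn):
        """Parse HTML, turning parser failures into warnings."""
        parser = HTMLParser()
        try:
            parser.feed(page)
        except HTMLParseError, e:
            msg = "Got an error parsing HTML."
            # Parsers before Python 2.7.3 are notably stricter.
            if sys.version_info[:3] < (2, 7, 3):
                msg += " Use Python 2.7.3 or newer for better HTML parsing."
            warn(msg, url, "HTMLParseError: " + str(e))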