summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Todd Gamblin <tgamblin@llnl.gov> 2014-04-25 14:41:37 -0700
committer Todd Gamblin <tgamblin@llnl.gov> 2014-04-25 14:41:37 -0700
commit 3bbca9bd05b0032d170f3e86a594b68157a66472 (patch)
tree 7934465364113819eca3573c107e990ac31c20a8
parent 15589754ecfee76ae6972eeebfe5d888172e5b9b (diff)
downloadspack-3bbca9bd05b0032d170f3e86a594b68157a66472.tar.gz
spack-3bbca9bd05b0032d170f3e86a594b68157a66472.tar.bz2
spack-3bbca9bd05b0032d170f3e86a594b68157a66472.tar.xz
spack-3bbca9bd05b0032d170f3e86a594b68157a66472.zip
Better version wildcard handling, better spidering
- Allow version wildcards to match any of the separators [_.-] instead of the exact separators the version was constructed with.
- Handles the fact that boost versions are written both 1.55.0 and 1_55_0.
- Update spidering to handle parse errors and warn that Python < 2.7.3 has less robust HTML parsing abilities.
-rw-r--r-- .gitignore 2
-rw-r--r-- lib/spack/spack/error.py 4
-rw-r--r-- lib/spack/spack/url.py 2
-rw-r--r-- lib/spack/spack/util/web.py 34
-rw-r--r-- lib/spack/spack/version.py 19
5 files changed, 38 insertions, 23 deletions
diff --git a/.gitignore b/.gitignore
index 0e239fa0bb..7010bf7ede 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
+/var/spack/stage
*.pyc
/opt/
-/var/
*~
.DS_Store
.idea
diff --git a/lib/spack/spack/error.py b/lib/spack/spack/error.py
index 47fb858f3f..40e0e75fdb 100644
--- a/lib/spack/spack/error.py
+++ b/lib/spack/spack/error.py
@@ -41,5 +41,7 @@ class UnsupportedPlatformError(SpackError):
class NoNetworkConnectionError(SpackError):
"""Raised when an operation needs an internet connection."""
def __init__(self, message, url):
- super(NoNetworkConnectionError, self).__init__(message)
+ super(NoNetworkConnectionError, self).__init__(
+ "No network connection: " + str(message),
+ "URL was: " + str(url))
self.url = url
diff --git a/lib/spack/spack/url.py b/lib/spack/spack/url.py
index f56aaee493..deac156571 100644
--- a/lib/spack/spack/url.py
+++ b/lib/spack/spack/url.py
@@ -206,7 +206,7 @@ def wildcard_version(path):
ver, start, end = parse_version_string_with_indices(path)
v = Version(ver)
- parts = list(re.escape(p) for p in path.split(str(v)))
+ parts = [re.escape(p) for p in re.split(v.wildcard(), path)]
# Make a group for the wildcard, so it will be captured by the regex.
version_group = '(%s)' % v.wildcard()
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index b5104eb076..ba42cb37b5 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -23,11 +23,12 @@
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##############################################################################
import re
+import sys
import subprocess
import urllib2
import urlparse
from multiprocessing import Pool
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
import llnl.util.tty as tty
@@ -67,7 +68,7 @@ def _spider(args):
pool. Firing off all the child links at once makes the fetch MUCH
faster for pages with lots of children.
"""
- url, depth, max_depth = args
+ url, depth, max_depth, raise_on_error = args
pages = {}
try:
@@ -81,11 +82,12 @@ def _spider(args):
resp = urllib2.urlopen(req, timeout=TIMEOUT)
if not "Content-type" in resp.headers:
- print "ignoring page " + url
+ tty.warn("ignoring page " + url)
return pages
if not resp.headers["Content-type"].startswith('text/html'):
- print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
+ tty.warn("ignoring page " + url + " with content type " +
+ resp.headers["Content-type"])
return pages
# Do the real GET request when we know it's just HTML.
@@ -100,9 +102,9 @@ def _spider(args):
# If we're not at max depth, parse out the links in the page
if depth < max_depth:
link_parser = LinkParser()
-
subcalls = []
link_parser.feed(page)
+
while link_parser.links:
raw_link = link_parser.links.pop()
@@ -112,7 +114,7 @@ def _spider(args):
# Evaluate the link relative to the page it came from.
abs_link = urlparse.urljoin(response_url, raw_link)
- subcalls.append((abs_link, depth+1, max_depth))
+ subcalls.append((abs_link, depth+1, max_depth, raise_on_error))
if subcalls:
pool = Pool(processes=len(subcalls))
@@ -121,13 +123,21 @@ def _spider(args):
pages.update(d)
except urllib2.URLError, e:
- # Only report it if it's the root page. We ignore errors when spidering.
- if depth == 1:
- raise spack.error.NoNetworkConnectionError(e.reason, url)
+ if raise_on_error:
+ raise spack.error.NoNetworkConnectionError(str(e), url)
+
+ except HTMLParseError, e:
+ # This error indicates that Python's HTML parser sucks.
+ msg = "Got an error parsing HTML."
+
+ # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
+ if sys.version_info[:3] < (2,7,3):
+ msg += " Use Python 2.7.3 or newer for better HTML parsing."
+
+ tty.warn(msg, url, "HTMLParseError: " + str(e))
except Exception, e:
- # Other types of errors are completely ignored.
- pass
+ pass # Other types of errors are completely ignored.
return pages
@@ -141,5 +151,5 @@ def get_pages(root_url, **kwargs):
performance over a sequential fetch.
"""
max_depth = kwargs.setdefault('depth', 1)
- pages = _spider((root_url, 1, max_depth))
+ pages = _spider((root_url, 1, max_depth, False))
return pages
diff --git a/lib/spack/spack/version.py b/lib/spack/spack/version.py
index 1f44c5f39b..0b5125fdf0 100644
--- a/lib/spack/spack/version.py
+++ b/lib/spack/spack/version.py
@@ -152,21 +152,24 @@ class Version(object):
return r'[a-zA-Z]+'
version = self.version
- separators = ('',) + self.separators
+
+ # Use a wildcard for separators, in case a version is written
+ # two different ways (e.g., boost writes 1_55_0 and 1.55.0)
+ sep_re = '[_.-]'
+ separators = ('',) + (sep_re,) * len(self.separators)
version += (version[-1],) * 2
- separators += (separators[-1],) * 2
+ separators += (sep_re,) * 2
- sep_res = [re.escape(sep) for sep in separators]
- seg_res = [a_or_n(seg) for seg in version]
+ segments = [a_or_n(seg) for seg in version]
- wc = seg_res[0]
- for i in xrange(1, len(sep_res)):
- wc += '(?:' + sep_res[i] + seg_res[i]
+ wc = segments[0]
+ for i in xrange(1, len(separators)):
+ wc += '(?:' + separators[i] + segments[i]
# Add possible alpha or beta indicator at the end of each segemnt
# We treat these specially b/c they're so common.
- wc += '[ab]?)?' * (len(seg_res) - 1)
+ wc += '[ab]?)?' * (len(segments) - 1)
return wc