From 3bbca9bd05b0032d170f3e86a594b68157a66472 Mon Sep 17 00:00:00 2001
From: Todd Gamblin <tgamblin@llnl.gov>
Date: Fri, 25 Apr 2014 14:41:37 -0700
Subject: Better version wildcard handling, better spidering

- Allow version wildcards to match [_.-] instead of the exact separators
  the version was constructed with.
  - Handles the fact that boost versions are written both 1.55.0 and 1_55_0.

- Update spidering to handle parse errors and warn that Python < 2.7.3
  has less robust HTML parsing abilities.
---
 .gitignore                  |  2 +-
 lib/spack/spack/error.py    |  4 +++-
 lib/spack/spack/url.py      |  2 +-
 lib/spack/spack/util/web.py | 34 ++++++++++++++++++++++------------
 lib/spack/spack/version.py  | 19 +++++++++++--------
 5 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0e239fa0bb..7010bf7ede 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
+/var/spack/stage
 *.pyc
 /opt/
-/var/
 *~
 .DS_Store
 .idea
diff --git a/lib/spack/spack/error.py b/lib/spack/spack/error.py
index 47fb858f3f..40e0e75fdb 100644
--- a/lib/spack/spack/error.py
+++ b/lib/spack/spack/error.py
@@ -41,5 +41,7 @@ class UnsupportedPlatformError(SpackError):
 class NoNetworkConnectionError(SpackError):
     """Raised when an operation needs an internet connection."""
     def __init__(self, message, url):
-        super(NoNetworkConnectionError, self).__init__(message)
+        super(NoNetworkConnectionError, self).__init__(
+            "No network connection: " + str(message),
+            "URL was: " + str(url))
         self.url = url
diff --git a/lib/spack/spack/url.py b/lib/spack/spack/url.py
index f56aaee493..deac156571 100644
--- a/lib/spack/spack/url.py
+++ b/lib/spack/spack/url.py
@@ -206,7 +206,7 @@ def wildcard_version(path):
     ver, start, end = parse_version_string_with_indices(path)

     v = Version(ver)
-    parts = list(re.escape(p) for p in path.split(str(v)))
+    parts = [re.escape(p) for p in re.split(v.wildcard(), path)]

     # Make a group for the wildcard, so it will be captured by the regex.
     version_group = '(%s)' % v.wildcard()
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index b5104eb076..ba42cb37b5 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -23,11 +23,12 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import sys
 import subprocess
 import urllib2
 import urlparse
 from multiprocessing import Pool
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError

 import llnl.util.tty as tty
@@ -67,7 +68,7 @@ def _spider(args):
        pool.  Firing off all the child links at once makes the fetch MUCH
        faster for pages with lots of children.
     """
-    url, depth, max_depth = args
+    url, depth, max_depth, raise_on_error = args

     pages = {}
     try:
@@ -81,11 +82,12 @@ def _spider(args):
         resp = urllib2.urlopen(req, timeout=TIMEOUT)

         if not "Content-type" in resp.headers:
-            print "ignoring page " + url
+            tty.warn("ignoring page " + url)
             return pages

         if not resp.headers["Content-type"].startswith('text/html'):
-            print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
+            tty.warn("ignoring page " + url + " with content type " +
+                     resp.headers["Content-type"])
             return pages

         # Do the real GET request when we know it's just HTML.
@@ -100,9 +102,9 @@ def _spider(args):
         # If we're not at max depth, parse out the links in the page
         if depth < max_depth:
             link_parser = LinkParser()
-
             subcalls = []
             link_parser.feed(page)
+
             while link_parser.links:
                 raw_link = link_parser.links.pop()

@@ -112,7 +114,7 @@ def _spider(args):

                 # Evaluate the link relative to the page it came from.
                 abs_link = urlparse.urljoin(response_url, raw_link)
-                subcalls.append((abs_link, depth+1, max_depth))
+                subcalls.append((abs_link, depth+1, max_depth, raise_on_error))

             if subcalls:
                 pool = Pool(processes=len(subcalls))
@@ -121,13 +123,21 @@ def _spider(args):
             pages.update(d)

     except urllib2.URLError, e:
-        # Only report it if it's the root page.  We ignore errors when spidering.
-        if depth == 1:
-            raise spack.error.NoNetworkConnectionError(e.reason, url)
+        if raise_on_error:
+            raise spack.error.NoNetworkConnectionError(str(e), url)
+
+    except HTMLParseError, e:
+        # This error indicates that Python's HTML parser sucks.
+        msg = "Got an error parsing HTML."
+
+        # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
+        if sys.version_info[:3] < (2,7,3):
+            msg += " Use Python 2.7.3 or newer for better HTML parsing."
+
+        tty.warn(msg, url, "HTMLParseError: " + str(e))

     except Exception, e:
-        # Other types of errors are completely ignored.
-        pass
+        pass    # Other types of errors are completely ignored.

     return pages

@@ -141,5 +151,5 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages = _spider((root_url, 1, max_depth))
+    pages = _spider((root_url, 1, max_depth, False))
     return pages
diff --git a/lib/spack/spack/version.py b/lib/spack/spack/version.py
index 1f44c5f39b..0b5125fdf0 100644
--- a/lib/spack/spack/version.py
+++ b/lib/spack/spack/version.py
@@ -152,21 +152,24 @@ class Version(object):
             return r'[a-zA-Z]+'

         version = self.version
-        separators = ('',) + self.separators
+
+        # Use a wildcard for separators, in case a version is written
+        # two different ways (e.g., boost writes 1_55_0 and 1.55.0)
+        sep_re = '[_.-]'
+        separators = ('',) + (sep_re,) * len(self.separators)

         version += (version[-1],) * 2
-        separators += (separators[-1],) * 2
+        separators += (sep_re,) * 2

-        sep_res = [re.escape(sep) for sep in separators]
-        seg_res = [a_or_n(seg) for seg in version]
+        segments = [a_or_n(seg) for seg in version]

-        wc = seg_res[0]
-        for i in xrange(1, len(sep_res)):
-            wc += '(?:' + sep_res[i] + seg_res[i]
+        wc = segments[0]
+        for i in xrange(1, len(separators)):
+            wc += '(?:' + separators[i] + segments[i]

         # Add possible alpha or beta indicator at the end of each segment
         # We treat these specially b/c they're so common.
-        wc += '[ab]?)?' * (len(seg_res) - 1)
+        wc += '[ab]?)?' * (len(segments) - 1)
         return wc
--
cgit v1.2.3-60-g2f50
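
A note on the separator wildcard above: Version.wildcard() now emits the
character class [_.-] at every separator position, so one pattern matches a
version however the project spells it. This is also why url.py switches from
path.split(str(v)) to re.split(v.wildcard(), path): splitting on the literal
version string would miss a differently-separated occurrence in the URL. A
minimal, self-contained sketch of the idea follows; simple_wildcard and
SEP_RE are illustrative names, not Spack's, and the real wildcard() is more
elaborate (alphabetic segments, trailing [ab]? markers):

    import re

    # Sketch only: each separator position matches '.', '_', or '-',
    # rather than the literal separator the version was parsed with.
    SEP_RE = '[_.-]'

    def simple_wildcard(version_string):
        """Build a regex matching this version under any separator spelling."""
        segments = re.split(SEP_RE, version_string)
        return SEP_RE.join(re.escape(s) for s in segments)

    pattern = simple_wildcard('1.55.0')      # -> '1[_.-]55[_.-]0'
    for spelling in ('1.55.0', '1_55_0', '1-55-0'):
        assert re.match(pattern + '$', spelling)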
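
The spidering change has a similar defensive shape: HTML parse failures are
downgraded to warnings instead of aborting the crawl, with an upgrade hint
only on interpreters that predate the parser fixes in 2.7.3. A rough Python 2
sketch of that error-handling shape, where feed_forgivingly is a hypothetical
helper and warn stands in for llnl.util.tty.warn (this is not the full
_spider(), which also extracts links and fans out over a process pool):

    import sys
    from HTMLParser import HTMLParser, HTMLParseError

    def feed_forgivingly(page, url, warn):
        """Parse HTML, turning parser failures into warnings."""
        parser = HTMLParser()
        try:
            parser.feed(page)
        except HTMLParseError, e:
            msg = "Got an error parsing HTML."
            # Parsers before Python 2.7.3 are notably stricter.
            if sys.version_info[:3] < (2, 7, 3):
                msg += " Use Python 2.7.3 or newer for better HTML parsing."
            warn(msg, url, "HTMLParseError: " + str(e))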