6 files changed, 253 insertions, 108 deletions
diff --git a/lib/spack/spack/cmd/create.py b/lib/spack/spack/cmd/create.py
index 5e42860f3e..9ecb709110 100644
--- a/lib/spack/spack/cmd/create.py
+++ b/lib/spack/spack/cmd/create.py
@@ -34,8 +34,8 @@ from llnl.util.filesystem import mkdirp
 import spack
 import spack.cmd
 import spack.cmd.checksum
-import spack.package
 import spack.url
+import spack.util.web
 from spack.util.naming import *
 import spack.util.crypto as crypto
 
@@ -166,7 +166,7 @@ def create(parser, args):
     tty.msg("This looks like a URL for %s version %s." % (name, version))
     tty.msg("Creating template for package %s" % name)
 
-    versions = spack.package.find_versions_of_archive(url)
+    versions = spack.util.web.find_versions_of_archive(url)
     rkeys = sorted(versions.keys(), reverse=True)
     versions = OrderedDict(zip(rkeys, (versions[v] for v in rkeys)))
 
diff --git a/lib/spack/spack/cmd/url-parse.py b/lib/spack/spack/cmd/url-parse.py
new file mode 100644
index 0000000000..077c793d2e
--- /dev/null
+++ b/lib/spack/spack/cmd/url-parse.py
@@ -0,0 +1,75 @@
+##############################################################################
+# Copyright (c) 2013, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Written by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the LICENSE file for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License (as published by
+# the Free Software Foundation) version 2.1 dated February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+import sys
+
+import llnl.util.tty as tty
+
+import spack
+import spack.url
+from spack.util.web import find_versions_of_archive
+
+description = "Show parsing of a URL, optionally spider web for other versions."
+
+def setup_parser(subparser):  # Declare this command's arguments on subparser.
+    subparser.add_argument('url', help="url of a package archive")  # positional argument
+    subparser.add_argument(  # boolean flag; url_parse() reads it as args.spider
+        '-s', '--spider', action='store_true', help="Spider the source page for versions.")
+
+
+def print_name_and_version(url):  # Print url with a marker line underneath it.
+    name, ns, nl, ntup, ver, vs, vl, vtup = spack.url.substitution_offsets(url)  # presumably (start, length) offsets of name/version in url -- TODO confirm
+    underlines = [" "] * max(ns+nl, vs+vl)  # blank marker row, long enough for both spans
+    for i in range(ns, ns+nl):
+        underlines[i] = '-'  # '-' marks the [ns, ns+nl) span
+    for i in range(vs, vs+vl):
+        underlines[i] = '~'  # '~' marks the [vs, vs+vl) span
+
+    print "    %s" % url
+    print "    %s" % ''.join(underlines)  # same 4-space prefix keeps markers aligned with url
+
+
+def url_parse(parser, args):  # Command entry point: show how args.url is parsed.
+    url = args.url
+
+    ver,  vs, vl = spack.url.parse_version_offset(url)  # NOTE(review): these six results are never read in this function,
+    name, ns, nl = spack.url.parse_name_offset(url, ver)  # and anything raised here is outside the try below -- TODO confirm intent.
+
+    tty.msg("Parsing URL:")
+    try:
+        print_name_and_version(url)
+    except spack.url.UrlParseError as e:
+        tty.error(str(e))  # parse failure is reported through tty.error
+
+    print
+    tty.msg("Substituting version 9.9.9b:")
+    newurl = spack.url.substitute_version(url, '9.9.9b')
+    print_name_and_version(newurl)  # not wrapped in a try, unlike the call above
+
+    if args.spider:  # only when -s/--spider was given
+        print
+        tty.msg("Spidering for versions:")
+        versions = find_versions_of_archive(url)
+        for v in sorted(versions):
+            print "%-20s%s" % (v, versions[v])  # key left-justified in a 20-char column, then its value
diff --git a/lib/spack/spack/fetch_strategy.py b/lib/spack/spack/fetch_strategy.py
index a9374fb34b..0657146bf6 100644
--- a/lib/spack/spack/fetch_strategy.py
+++ b/lib/spack/spack/fetch_strategy.py
@@ -687,7 +687,7 @@ def for_package_version(pkg, version):
 
 
 class FetchError(spack.error.SpackError):
-    def __init__(self, msg, long_msg):
+    def __init__(self, msg, long_msg=None):
         super(FetchError, self).__init__(msg, long_msg)
 
 
@@ -705,7 +705,7 @@ class NoArchiveFileError(FetchError):
 
 
 class NoDigestError(FetchError):
-    def __init__(self, msg, long_msg):
+    def __init__(self, msg, long_msg=None):
         super(NoDigestError, self).__init__(msg, long_msg)
 
 
diff --git a/lib/spack/spack/package.py b/lib/spack/spack/package.py
index b95afb073d..84bcb15f7f 100644
--- a/lib/spack/spack/package.py
+++ b/lib/spack/spack/package.py
@@ -733,9 +733,10 @@ class Package(object):
 
         # Construct paths to special files in the archive dir used to
         # keep track of whether patches were successfully applied.
-        archive_dir = self.stage.source_path
-        good_file = join_path(archive_dir, '.spack_patched')
-        bad_file  = join_path(archive_dir, '.spack_patch_failed')
+        archive_dir     = self.stage.source_path
+        good_file       = join_path(archive_dir, '.spack_patched')
+        no_patches_file = join_path(archive_dir, '.spack_no_patches')
+        bad_file        = join_path(archive_dir, '.spack_patch_failed')
 
         # If we encounter an archive that failed to patch, restage it
         # so that we can apply all the patches again.
@@ -749,29 +750,46 @@ class Package(object):
         if os.path.isfile(good_file):
             tty.msg("Already patched %s" % self.name)
             return
+        elif os.path.isfile(no_patches_file):
+            tty.msg("No patches needed for %s." % self.name)
+            return
 
         # Apply all the patches for specs that match this one
+        patched = False
         for spec, patch_list in self.patches.items():
             if self.spec.satisfies(spec):
                 for patch in patch_list:
-                    tty.msg('Applying patch %s' % patch.path_or_url)
                     try:
                         patch.apply(self.stage)
+                        tty.msg('Applied patch %s' % patch.path_or_url)
+                        patched = True
                     except:
                         # Touch bad file if anything goes wrong.
+                        tty.msg('Patch %s failed.' % patch.path_or_url)
                         touch(bad_file)
                         raise
 
-        # patch succeeded.  Get rid of failed file & touch good file so we
-        # don't try to patch again again next time.
+        if has_patch_fun:
+            try:
+                self.patch()
+                tty.msg("Ran patch() for %s." % self.name)
+                patched = True
+            except:
+                tty.msg("patch() function failed for %s." % self.name)
+                touch(bad_file)
+                raise
+
+        # Get rid of any old failed file -- patches have either succeeded
+        # or are not needed.  This is mostly defensive -- it's needed
+        # if the restage() method doesn't clean *everything* (e.g., for a repo)
         if os.path.isfile(bad_file):
             os.remove(bad_file)
-        touch(good_file)
-
-        if has_patch_fun:
-            self.patch()
 
-        tty.msg("Patched %s" % self.name)
+        # touch good or no patches file so that we skip next time.
+        if patched:
+            touch(good_file)
+        else:
+            touch(no_patches_file)
 
 
     def do_fake_install(self):
@@ -1164,7 +1182,7 @@ class Package(object):
             raise VersionFetchError(self.__class__)
 
         try:
-            return find_versions_of_archive(
+            return spack.util.web.find_versions_of_archive(
                 *self.all_urls, list_url=self.list_url, list_depth=self.list_depth)
         except spack.error.NoNetworkConnectionError, e:
             tty.die("Package.fetch_versions couldn't connect to:",
@@ -1188,49 +1206,6 @@ class Package(object):
         return " ".join("-Wl,-rpath=%s" % p for p in self.rpath)
 
 
-def find_versions_of_archive(*archive_urls, **kwargs):
-    list_url   = kwargs.get('list_url', None)
-    list_depth = kwargs.get('list_depth', 1)
-
-    # Generate a list of list_urls based on archive urls and any
-    # explicitly listed list_url in the package
-    list_urls = set()
-    if list_url:
-        list_urls.add(list_url)
-    for aurl in archive_urls:
-        list_urls.add(spack.url.find_list_url(aurl))
-
-    # Grab some web pages to scrape.
-    page_map = {}
-    for lurl in list_urls:
-        pages = spack.util.web.get_pages(lurl, depth=list_depth)
-        page_map.update(pages)
-
-    # Scrape them for archive URLs
-    regexes = []
-    for aurl in archive_urls:
-        # This creates a regex from the URL with a capture group for
-        # the version part of the URL.  The capture group is converted
-        # to a generic wildcard, so we can use this to extract things
-        # on a page that look like archive URLs.
-        url_regex = spack.url.wildcard_version(aurl)
-
-        # We'll be a bit more liberal and just look for the archive
-        # part, not the full path.
-        regexes.append(os.path.basename(url_regex))
-
-    # Build a version list from all the matches we find
-    versions = {}
-    for page_url, content in page_map.iteritems():
-        # extract versions from matches.
-        for regex in regexes:
-            versions.update(
-                (Version(m.group(1)), urljoin(page_url, m.group(0)))
-                for m in re.finditer(regex, content))
-
-    return versions
-
-
 def validate_package_url(url_string):
     """Determine whether spack can handle a particular URL or not."""
     url = urlparse(url_string)
diff --git a/lib/spack/spack/stage.py b/lib/spack/spack/stage.py
index 754344fc01..76ca7273cb 100644
--- a/lib/spack/spack/stage.py
+++ b/lib/spack/spack/stage.py
@@ -82,14 +82,18 @@ class Stage(object):
                  stage object later).  If name is not provided, then this
                  stage will be given a unique name automatically.
         """
+        # TODO: fetch/stage coupling needs to be reworked -- the logic
+        # TODO: here is convoluted and not modular enough.
         if isinstance(url_or_fetch_strategy, basestring):
             self.fetcher = fs.from_url(url_or_fetch_strategy)
         elif isinstance(url_or_fetch_strategy, fs.FetchStrategy):
             self.fetcher = url_or_fetch_strategy
         else:
             raise ValueError("Can't construct Stage without url or fetch strategy")
-
         self.fetcher.set_stage(self)
+        self.default_fetcher = self.fetcher  # self.fetcher can change with mirrors.
+        self.skip_checksum_for_mirror = True # used for mirrored archives of repositories.
+
         self.name = kwargs.get('name')
         self.mirror_path = kwargs.get('mirror_path')
 
@@ -198,17 +202,18 @@ class Stage(object):
     @property
     def archive_file(self):
         """Path to the source archive within this stage directory."""
-        if not isinstance(self.fetcher, fs.URLFetchStrategy):
-            return None
+        paths = []
+        if isinstance(self.fetcher, fs.URLFetchStrategy):
+            paths.append(os.path.join(self.path, os.path.basename(self.fetcher.url)))
 
-        paths = [os.path.join(self.path, os.path.basename(self.fetcher.url))]
         if self.mirror_path:
             paths.append(os.path.join(self.path, os.path.basename(self.mirror_path)))
 
         for path in paths:
             if os.path.exists(path):
                 return path
-        return None
+        else:
+            return None
 
 
     @property
@@ -238,23 +243,34 @@ class Stage(object):
         """Downloads an archive or checks out code from a repository."""
         self.chdir()
 
-        fetchers = [self.fetcher]
+        fetchers = [self.default_fetcher]
 
         # TODO: move mirror logic out of here and clean it up!
+        # TODO: Or @alalazo may have some ideas about how to use a
+        # TODO: CompositeFetchStrategy here.
+        self.skip_checksum_for_mirror = True
         if self.mirror_path:
             urls = ["%s/%s" % (m, self.mirror_path) for m in _get_mirrors()]
 
+            # If this archive is normally fetched from a tarball URL,
+            # then use the same digest.  `spack mirror` ensures that
+            # the checksum will be the same.
             digest = None
-            if isinstance(self.fetcher, fs.URLFetchStrategy):
-                digest = self.fetcher.digest
-            fetchers = [fs.URLFetchStrategy(url, digest)
-                        for url in urls] + fetchers
-            for f in fetchers:
-                f.set_stage(self)
+            if isinstance(self.default_fetcher, fs.URLFetchStrategy):
+                digest = self.default_fetcher.digest
+
+            # Have to skip the checksum for things archived from
+            # repositories.  How can this be made safer?
+            self.skip_checksum_for_mirror = not bool(digest)
+
+            for url in urls:
+                fetchers.insert(0, fs.URLFetchStrategy(url, digest))
 
         for fetcher in fetchers:
             try:
-                fetcher.fetch()
+                fetcher.set_stage(self)
+                self.fetcher = fetcher
+                self.fetcher.fetch()
                 break
             except spack.error.SpackError, e:
                 tty.msg("Fetching from %s failed." % fetcher)
@@ -262,13 +278,22 @@ class Stage(object):
                 continue
         else:
             errMessage = "All fetchers failed for %s" % self.name
+            self.fetcher = self.default_fetcher
             raise fs.FetchError(errMessage, None)
 
 
     def check(self):
         """Check the downloaded archive against a checksum digest.
            No-op if this stage checks code out of a repository."""
-        self.fetcher.check()
+        if self.fetcher is not self.default_fetcher and self.skip_checksum_for_mirror:
+            tty.warn("Fetching from mirror without a checksum!",
+                     "This package is normally checked out from a version "
+                     "control system, but it has been archived on a spack "
+                     "mirror.  This means we cannot know a checksum for the "
+                     "tarball in advance. Be sure that your connection to "
+                     "this mirror is secure!.")
+        else:
+            self.fetcher.check()
 
 
     def expand_archive(self):
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 94384e9c86..e26daef296 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -23,6 +23,7 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import os
 import sys
 import subprocess
 import urllib2, cookielib
@@ -70,7 +71,9 @@ def _spider(args):
     """
     url, visited, root, opener, depth, max_depth, raise_on_error = args
 
-    pages = {}
+    pages = {}     # dict from page URL -> text content.
+    links = set()  # set of all links seen on visited pages.
+
     try:
         # Make a HEAD request first to check the content type.  This lets
         # us ignore tarballs and gigantic files.
@@ -99,42 +102,45 @@ def _spider(args):
         page = response.read()
         pages[response_url] = page
 
-        # If we're not at max depth, parse out the links in the page
-        if depth < max_depth:
-            link_parser = LinkParser()
-            subcalls = []
-            link_parser.feed(page)
-
-            while link_parser.links:
-                raw_link = link_parser.links.pop()
+        # Parse out the links in the page
+        link_parser = LinkParser()
+        subcalls = []
+        link_parser.feed(page)
 
-                # Skip stuff that looks like an archive
-                if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
-                    continue
+        while link_parser.links:
+            raw_link = link_parser.links.pop()
+            abs_link = urlparse.urljoin(response_url, raw_link)
 
-                # Evaluate the link relative to the page it came from.
-                abs_link = urlparse.urljoin(response_url, raw_link)
+            links.add(abs_link)
 
-                # Skip things outside the root directory
-                if not abs_link.startswith(root):
-                    continue
+            # Skip stuff that looks like an archive
+            if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
+                continue
 
-                # Skip already-visited links
-                if abs_link in visited:
-                    continue
+            # Skip things outside the root directory
+            if not abs_link.startswith(root):
+                continue
 
-                subcalls.append((abs_link, visited, root, None, depth+1, max_depth, raise_on_error))
-                visited.add(abs_link)
+            # Skip already-visited links
+            if abs_link in visited:
+                continue
 
-            if subcalls:
-                try:
-                    pool = Pool(processes=len(subcalls))
-                    dicts = pool.map(_spider, subcalls)
-                    for d in dicts:
-                        pages.update(d)
-                finally:
-                    pool.terminate()
-                    pool.join()
+            # Still inside the loop: queue every surviving link unless at max depth.
+            if depth < max_depth:
+                subcalls.append((abs_link, visited, root, None,
+                                 depth+1, max_depth, raise_on_error))
+                visited.add(abs_link)
+
+        if subcalls:
+            try:
+                pool = Pool(processes=len(subcalls))
+                results = pool.map(_spider, subcalls)
+                for sub_pages, sub_links in results:
+                    pages.update(sub_pages)
+                    links.update(sub_links)
+            finally:
+                pool.terminate()
+                pool.join()
 
     except urllib2.URLError, e:
         tty.debug(e)
@@ -155,10 +161,10 @@ def _spider(args):
         # Other types of errors are completely ignored, except in debug mode.
         tty.debug("Error in _spider: %s" % e)
 
-    return pages
+    return pages, links
 
 
-def get_pages(root_url, **kwargs):
+def spider(root_url, **kwargs):
     """Gets web pages from a root URL.
        If depth is specified (e.g., depth=2), then this will also fetches pages
        linked from the root and its children up to depth.
@@ -167,5 +173,69 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages =  _spider((root_url, set(), root_url, None, 1, max_depth, False))
-    return pages
+    pages, links =  _spider((root_url, set(), root_url, None, 1, max_depth, False))
+    return pages, links
+
+
+def find_versions_of_archive(*archive_urls, **kwargs):
+    """Scrape web pages for new versions of a tarball.
+
+    Arguments:
+      archive_urls:
+          URLs for different versions of a package. Typically these
+          are just the tarballs from the package file itself.  By
+          default, this searches the parent directories of archives.
+
+    Keyword Arguments:
+      list_url:
+          URL for a listing of archives.  Spack will scrape these
+          pages for download links that look like the archive URL.
+      list_depth:
+          Max depth to follow links on list_url pages.
+
+    Returns:
+      dict mapping each version parsed from a matching link to its URL.
+    """
+    list_url   = kwargs.get('list_url', None)
+    list_depth = kwargs.get('list_depth', 1)
+
+    # Generate a list of list_urls based on archive urls and any
+    # explicitly listed list_url in the package
+    list_urls = set()
+    if list_url:
+        list_urls.add(list_url)
+    for aurl in archive_urls:
+        list_urls.add(spack.url.find_list_url(aurl))
+
+    # Grab some web pages to scrape.
+    pages = {}  # NOTE(review): filled below but never read in this function
+    links = set()
+    for lurl in list_urls:
+        p, l = spider(lurl, depth=list_depth)
+        pages.update(p)
+        links.update(l)
+
+    # Build the regexes used to recognize archive URLs among the links
+    regexes = []
+    for aurl in archive_urls:
+        # This creates a regex from the URL with a capture group for
+        # the version part of the URL.  The capture group is converted
+        # to a generic wildcard, so we can use this to extract things
+        # on a page that look like archive URLs.
+        url_regex = spack.url.wildcard_version(aurl)
+
+        # We'll be a bit more liberal and just look for the archive
+        # part, not the full path.
+        regexes.append(os.path.basename(url_regex))
+
+    # Build a dict version -> URL from any links that match the wildcards.
+    versions = {}
+    for url in links:
+        if any(re.search(r, url) for r in regexes):
+            try:
+                ver = spack.url.parse_version(url)
+                versions[ver] = url
+            except spack.url.UndetectableVersionError as e:  # link matched but no version could be parsed; skip it
+                continue
+
+    return versions