author | Todd Gamblin <tgamblin@llnl.gov> | 2017-04-01 14:03:54 -0700
---|---|---
committer | Todd Gamblin <tgamblin@llnl.gov> | 2017-04-01 15:10:45 -0700
commit | 221f17971634e43950f90333776589c4d0bf0ce3 (patch) |
tree | 16edb80358c84baaf24c14a28304dfff25f71b04 /lib |
parent | 28d6d375b46273779e377b844ddb80739f393514 (diff) |
Add better tests for web.py; fix some bugs found with spidering.
- _spider in web.py was actually failing to spider deeper than a certain
point.
- Fixed multiprocessing pools to not use daemons and to allow recursive
  spawning (see the sketch after this list).
- Added detailed tests for spidering and for finding archive versions.
- Left some xfail URL-finding exercises for the reader.
- Fixed noqa annotations for some @when decorators.
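
A note on the multiprocessing bullet above: a stock `multiprocessing.Pool` marks its workers as daemon processes, and daemonic processes may not spawn children, so a worker that tries to open its own pool for the next level of links dies immediately. The sketch below is a minimal, self-contained illustration of the idea, not Spack's code verbatim: the diff targets Python 2 and overrides `_get_daemon`/`_set_daemon`, whereas this version uses a Python 3 style property override and assumes a Python where `Pool` still instantiates its `Process` class attribute directly (before 3.8); `square` and `sum_of_squares` are made-up helpers.

```python
import multiprocessing
import multiprocessing.pool


class NonDaemonProcess(multiprocessing.Process):
    """Process that always reports daemon=False, so it may have children."""

    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass  # Pool tries to set daemon=True on its workers; ignore it.


class NonDaemonPool(multiprocessing.pool.Pool):
    """Pool whose workers are allowed to spawn sub-pools."""
    Process = NonDaemonProcess


def square(x):
    return x * x


def sum_of_squares(n):
    # Runs inside a worker of the outer pool.  With a plain Pool this nested
    # Pool raises "daemonic processes are not allowed to have children".
    with multiprocessing.Pool(2) as sub:
        return sum(sub.map(square, range(n)))


if __name__ == '__main__':
    pool = NonDaemonPool(2)
    try:
        print(pool.map(sum_of_squares, [3, 4, 5]))  # [5, 14, 30]
    finally:
        pool.terminate()
        pool.join()
```

The same terminate/join-in-`finally` pattern appears in the diff below, so worker processes are cleaned up even when a fetch raises.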
Diffstat (limited to 'lib')
-rw-r--r-- | lib/spack/spack/package.py | 10
-rw-r--r-- | lib/spack/spack/test/data/web/1.html | 10
-rw-r--r-- | lib/spack/spack/test/data/web/2.html | 12
-rw-r--r-- | lib/spack/spack/test/data/web/3.html | 11
-rw-r--r-- | lib/spack/spack/test/data/web/4.html | 11
-rw-r--r-- | lib/spack/spack/test/data/web/index.html | 10
-rw-r--r-- | lib/spack/spack/test/web.py | 165
-rw-r--r-- | lib/spack/spack/util/web.py | 89
8 files changed, 283 insertions, 35 deletions
diff --git a/lib/spack/spack/package.py b/lib/spack/spack/package.py
index 6b71c0f1a9..177b4c908b 100644
--- a/lib/spack/spack/package.py
+++ b/lib/spack/spack/package.py
@@ -570,7 +570,7 @@ class PackageBase(with_metaclass(PackageMeta, object)):
             self.list_url = None
 
         if not hasattr(self, 'list_depth'):
-            self.list_depth = 1
+            self.list_depth = 0
 
         # Set default licensing information
         if not hasattr(self, 'license_required'):
@@ -966,6 +966,10 @@ class PackageBase(with_metaclass(PackageMeta, object)):
         self.stage.expand_archive()
         self.stage.chdir_to_source()
 
+    def patch(self):
+        """Default patch implementation is a no-op."""
+        pass
+
     def do_patch(self):
         """Calls do_stage(), then applied patches to the expanded tarball
         if they haven't been applied already."""
@@ -1686,9 +1690,7 @@ class PackageBase(with_metaclass(PackageMeta, object)):
 
         try:
             return spack.util.web.find_versions_of_archive(
-                *self.all_urls,
-                list_url=self.list_url,
-                list_depth=self.list_depth)
+                self.all_urls, self.list_url, self.list_depth)
         except spack.error.NoNetworkConnectionError as e:
             tty.die("Package.fetch_versions couldn't connect to:", e.url,
                     e.message)
diff --git a/lib/spack/spack/test/data/web/1.html b/lib/spack/spack/test/data/web/1.html
new file mode 100644
index 0000000000..ef49c38cdb
--- /dev/null
+++ b/lib/spack/spack/test/data/web/1.html
@@ -0,0 +1,10 @@
+<html>
+  <head>
+    This is page 1.
+  </head>
+  <body>
+    <a href="2.html">list_depth=2 follows this.</a>
+
+    <a href="foo-1.0.0.tar.gz">foo-1.0.0.tar.gz</a>
+  </body>
+</html>
diff --git a/lib/spack/spack/test/data/web/2.html b/lib/spack/spack/test/data/web/2.html
new file mode 100644
index 0000000000..64c843f25b
--- /dev/null
+++ b/lib/spack/spack/test/data/web/2.html
@@ -0,0 +1,12 @@
+<html>
+  <head>
+    This is page 2.
+  </head>
+  <body>
+    <a href="3.html">list_depth=3 follows this.</a>
+    <a href="4.html">list_depth=3 follows this too.</a>
+
+    <a href="foo-2.0.0.tar.gz">foo-2.0.0.tar.gz</a>
+    <a href="foo-2.0.0b2.tar.gz">foo-2.0.0b2.tar.gz</a>
+  </body>
+</html>
diff --git a/lib/spack/spack/test/data/web/3.html b/lib/spack/spack/test/data/web/3.html
new file mode 100644
index 0000000000..e530206035
--- /dev/null
+++ b/lib/spack/spack/test/data/web/3.html
@@ -0,0 +1,11 @@
+<html>
+  <head>
+    This is page 3.
+  </head>
+  <body>
+    <a href="index.html">This link is already visited.</a>
+
+    <a href="foo-3.0.tar.gz">foo-3.0.tar.gz</a>
+    <a href="foo-3.0a1.tar.gz">foo-3.0a1.tar.gz</a>
+  </body>
+</html>
diff --git a/lib/spack/spack/test/data/web/4.html b/lib/spack/spack/test/data/web/4.html
new file mode 100644
index 0000000000..b5fe850f4d
--- /dev/null
+++ b/lib/spack/spack/test/data/web/4.html
@@ -0,0 +1,11 @@
+<html>
+  <head>
+    This is page 4.
+  </head>
+  <body>
+    This page is terminal and has no links to other pages.
+
+    <a href="foo-4.5.tar.gz">foo-4.5.tar.gz.</a>
+    <a href="foo-4.5-rc5.tar.gz">foo-4.1-rc5.tar.gz.</a>
+  </body>
+</html>
diff --git a/lib/spack/spack/test/data/web/index.html b/lib/spack/spack/test/data/web/index.html
new file mode 100644
index 0000000000..3985deeb35
--- /dev/null
+++ b/lib/spack/spack/test/data/web/index.html
@@ -0,0 +1,10 @@
+<html>
+  <head>
+    This is the root page.
+  </head>
+  <body>
+    <a href="1.html">list_depth=1 follows this.</a>
+
+    <a href="foo-0.0.0.tar.gz">foo-0.0.0.tar.gz</a>
+  </body>
+</html>
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
new file mode 100644
index 0000000000..9a7f4d9f8b
--- /dev/null
+++ b/lib/spack/spack/test/web.py
@@ -0,0 +1,165 @@
+##############################################################################
+# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the LICENSE file for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License (as
+# published by the Free Software Foundation) version 2.1, February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+"""Tests for web.py."""
+import pytest
+import os
+
+import spack
+from spack.util.web import spider, find_versions_of_archive
+from spack.version import *
+
+
+web_data_path = os.path.join(spack.test_path, 'data', 'web')
+
+root = 'file://' + web_data_path + '/index.html'
+root_tarball = 'file://' + web_data_path + '/foo-0.0.0.tar.gz'
+
+page_1 = 'file://' + os.path.join(web_data_path, '1.html')
+page_2 = 'file://' + os.path.join(web_data_path, '2.html')
+page_3 = 'file://' + os.path.join(web_data_path, '3.html')
+page_4 = 'file://' + os.path.join(web_data_path, '4.html')
+
+
+def test_spider_0():
+    pages, links = spider(root, depth=0)
+
+    assert root in pages
+    assert page_1 not in pages
+    assert page_2 not in pages
+    assert page_3 not in pages
+    assert page_4 not in pages
+
+    assert "This is the root page." in pages[root]
+
+    assert root not in links
+    assert page_1 in links
+    assert page_2 not in links
+    assert page_3 not in links
+    assert page_4 not in links
+
+
+def test_spider_1():
+    pages, links = spider(root, depth=1)
+
+    assert root in pages
+    assert page_1 in pages
+    assert page_2 not in pages
+    assert page_3 not in pages
+    assert page_4 not in pages
+
+    assert "This is the root page." in pages[root]
+    assert "This is page 1." in pages[page_1]
+
+    assert root not in links
+    assert page_1 in links
+    assert page_2 in links
+    assert page_3 not in links
+    assert page_4 not in links
+
+
+def test_spider_2():
+    pages, links = spider(root, depth=2)
+
+    assert root in pages
+    assert page_1 in pages
+    assert page_2 in pages
+    assert page_3 not in pages
+    assert page_4 not in pages
+
+    assert "This is the root page." in pages[root]
+    assert "This is page 1." in pages[page_1]
+    assert "This is page 2." in pages[page_2]
+
+    assert root not in links
+    assert page_1 in links
+    assert page_1 in links
+    assert page_2 in links
+    assert page_3 in links
+    assert page_4 in links
+
+
+def test_spider_3():
+    pages, links = spider(root, depth=3)
+
+    assert root in pages
+    assert page_1 in pages
+    assert page_2 in pages
+    assert page_3 in pages
+    assert page_4 in pages
+
+    assert "This is the root page." in pages[root]
+    assert "This is page 1." in pages[page_1]
+    assert "This is page 2." in pages[page_2]
+    assert "This is page 3." in pages[page_3]
+    assert "This is page 4." in pages[page_4]
+
+    assert root in links  # circular link on page 3
+    assert page_1 in links
+    assert page_1 in links
+    assert page_2 in links
+    assert page_3 in links
+    assert page_4 in links
+
+
+def test_find_versions_of_archive_0():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=0)
+    assert ver('0.0.0') in versions
+
+
+def test_find_versions_of_archive_1():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=1)
+    assert ver('0.0.0') in versions
+    assert ver('1.0.0') in versions
+
+
+def test_find_versions_of_archive_2():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=2)
+    assert ver('0.0.0') in versions
+    assert ver('1.0.0') in versions
+    assert ver('2.0.0') in versions
+
+
+@pytest.mark.xfail
+def test_find_exotic_versions_of_archive_2():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=2)
+    # up for grabs to make this better.
+    assert ver('2.0.0b2') in versions
+
+
+def test_find_versions_of_archive_3():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=3)
+    assert ver('0.0.0') in versions
+    assert ver('1.0.0') in versions
+    assert ver('2.0.0') in versions
+    assert ver('3.0') in versions
+    assert ver('4.5') in versions
+
+
+@pytest.mark.xfail
+def test_find_exotic_versions_of_archive_3():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=3)
+    assert ver('2.0.0b2') in versions
+    assert ver('3.0a1') in versions
+    assert ver('4.5-rc5') in versions
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 09bf2c34e1..8e2dd34635 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -25,11 +25,12 @@
 import re
 import os
 import sys
+import traceback
 
 from six.moves.urllib.request import urlopen, Request
 from six.moves.urllib.error import URLError
 from six.moves.urllib.parse import urljoin
-from multiprocessing import Pool
+import multiprocessing.pool
 
 try:
     # Python 2 had these in the HTMLParser package.
@@ -67,25 +68,42 @@ class LinkParser(HTMLParser):
         self.links.append(val)
 
 
-def _spider(args):
-    """_spider(url, depth, max_depth)
-
-    Fetches URL and any pages it links to up to max_depth.  depth should
-    initially be 1, and max_depth includes the root.  This function will
-    print out a warning only if the root can't be fetched; it ignores
-    errors with pages that the root links to.
-
-    This will return a list of the pages fetched, in no particular order.
-
-    Takes args as a tuple b/c it's intended to be used by a multiprocessing
-    pool.  Firing off all the child links at once makes the fetch MUCH
-    faster for pages with lots of children.
-    """
-    url, visited, root, opener, depth, max_depth, raise_on_error = args
-
+class NonDaemonProcess(multiprocessing.Process):
+    """Process that allows sub-processes, so pools can have sub-pools."""
+    def _get_daemon(self):
+        return False
+
+    def _set_daemon(self, value):
+        pass
+
+    daemon = property(_get_daemon, _set_daemon)
+
+
+class NonDaemonPool(multiprocessing.pool.Pool):
+    """Pool that uses non-daemon processes"""
+    Process = NonDaemonProcess
+
+
+def _spider(url, visited, root, depth, max_depth, raise_on_error):
+    """Fetches URL and any pages it links to up to max_depth.
+
+    depth should initially be zero, and max_depth is the max depth of
+    links to follow from the root.
+
+    Prints out a warning only if the root can't be fetched; it ignores
+    errors with pages that the root links to.
+
+    Returns a tuple of:
+      - pages: dict of pages visited (URL) mapped to their full text.
+      - links: set of links encountered while visiting the pages.
+    """
     pages = {}     # dict from page URL -> text content.
     links = set()  # set of all links seen on visited pages.
 
+    # root may end with index.html -- chop that off.
+    if root.endswith('/index.html'):
+        root = re.sub('/index.html$', '', root)
+
     try:
         # Make a HEAD request first to check the content type.  This lets
         # us ignore tarballs and gigantic files.
@@ -139,17 +157,19 @@
 
             # If we're not at max depth, follow links.
             if depth < max_depth:
-                subcalls.append((abs_link, visited, root, None,
+                subcalls.append((abs_link, visited, root,
                                  depth + 1, max_depth, raise_on_error))
                 visited.add(abs_link)
 
     if subcalls:
+        pool = NonDaemonPool(processes=len(subcalls))
         try:
-            pool = Pool(processes=len(subcalls))
-            results = pool.map(_spider, subcalls)
+            results = pool.map(_spider_wrapper, subcalls)
+
             for sub_pages, sub_links in results:
                 pages.update(sub_pages)
                 links.update(sub_links)
+
         finally:
             pool.terminate()
             pool.join()
@@ -171,46 +191,53 @@
     except Exception as e:
         # Other types of errors are completely ignored, except in debug mode.
-        tty.debug("Error in _spider: %s" % e)
+        tty.debug("Error in _spider: %s:%s" % (type(e), e),
+                  traceback.format_exc())
 
     return pages, links
 
 
-def spider(root_url, **kwargs):
+def _spider_wrapper(args):
+    """Wrapper for using spider with multiprocessing."""
+    return _spider(*args)
+
+
+def spider(root_url, depth=0):
     """Gets web pages from a root URL.
-       If depth is specified (e.g., depth=2), then this will also fetches pages
-       linked from the root and its children up to depth.
+
+    If depth is specified (e.g., depth=2), then this will also follow
+    up to <depth> levels of links from the root.
 
     This will spawn processes to fetch the children, for much improved
     performance over a sequential fetch.
+
     """
-    max_depth = kwargs.setdefault('depth', 1)
-    pages, links = _spider((root_url, set(), root_url, None,
-                            1, max_depth, False))
+    pages, links = _spider(root_url, set(), root_url, 0, depth, False)
     return pages, links
 
 
-def find_versions_of_archive(*archive_urls, **kwargs):
+def find_versions_of_archive(archive_urls, list_url=None, list_depth=0):
     """Scrape web pages for new versions of a tarball.
 
     Arguments:
      archive_urls:
-          URLs for different versions of a package.  Typically these
-          are just the tarballs from the package file itself.  By
-          default, this searches the parent directories of archives.
+          URL or sequence of URLs for different versions of a
+          package.  Typically these are just the tarballs from the package
+          file itself.  By default, this searches the parent directories
+          of archives.
 
    Keyword Arguments:
      list_url:
-
          URL for a listing of archives.  Spack will scrape these
          pages for download links that look like the archive URL.
 
      list_depth:
-          Max depth to follow links on list_url pages.
+          Max depth to follow links on list_url pages.  Default 0.
+
    """
-    list_url = kwargs.get('list_url', None)
-    list_depth = kwargs.get('list_depth', 1)
+    if not isinstance(archive_urls, (list, tuple)):
+        archive_urls = [archive_urls]
 
    # Generate a list of list_urls based on archive urls and any
    # explicitly listed list_url in the package