author | Todd Gamblin <tgamblin@llnl.gov> | 2017-04-01 14:03:54 -0700
---|---|---
committer | Todd Gamblin <tgamblin@llnl.gov> | 2017-04-01 15:10:45 -0700
commit | 221f17971634e43950f90333776589c4d0bf0ce3 (patch) |
tree | 16edb80358c84baaf24c14a28304dfff25f71b04 /lib |
parent | 28d6d375b46273779e377b844ddb80739f393514 (diff) |
Add better tests for web.py; fix some bugs found with spidering.
- _spider in web.py was actually failing to spider deeper than a certain
point.
- Fixed multiprocessing pools to not use daemons and to allow recursive
  spawning (see the sketch after this list).
- Added detailed tests for spidering and for finding archive versions.
- Left some xfail URL-finding exercises for the reader.
- Fixed noqa annotations for some @when decorators.
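
A note on the multiprocessing bullet above: a stock `multiprocessing.Pool` marks its workers as daemon processes, and daemonic processes may not spawn children, so a worker that tries to open its own pool for the next level of links dies immediately. The sketch below is a minimal, self-contained illustration of the idea, not Spack's code verbatim: the diff targets Python 2 and overrides `_get_daemon`/`_set_daemon`, whereas this version uses a Python 3 style property override and assumes a Python where `Pool` still instantiates its `Process` class attribute directly (before 3.8); `square` and `sum_of_squares` are made-up helpers.

```python
import multiprocessing
import multiprocessing.pool


class NonDaemonProcess(multiprocessing.Process):
    """Process that always reports daemon=False, so it may have children."""

    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass  # Pool tries to set daemon=True on its workers; ignore it.


class NonDaemonPool(multiprocessing.pool.Pool):
    """Pool whose workers are allowed to spawn sub-pools."""
    Process = NonDaemonProcess


def square(x):
    return x * x


def sum_of_squares(n):
    # Runs inside a worker of the outer pool.  With a plain Pool this nested
    # Pool raises "daemonic processes are not allowed to have children".
    with multiprocessing.Pool(2) as sub:
        return sum(sub.map(square, range(n)))


if __name__ == '__main__':
    pool = NonDaemonPool(2)
    try:
        print(pool.map(sum_of_squares, [3, 4, 5]))  # [5, 14, 30]
    finally:
        pool.terminate()
        pool.join()
```

The same terminate/join-in-`finally` pattern appears in the diff below, so worker processes are cleaned up even when a fetch raises.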
Diffstat (limited to 'lib')
-rw-r--r-- | lib/spack/spack/package.py | 10
-rw-r--r-- | lib/spack/spack/test/data/web/1.html | 10
-rw-r--r-- | lib/spack/spack/test/data/web/2.html | 12
-rw-r--r-- | lib/spack/spack/test/data/web/3.html | 11
-rw-r--r-- | lib/spack/spack/test/data/web/4.html | 11
-rw-r--r-- | lib/spack/spack/test/data/web/index.html | 10
-rw-r--r-- | lib/spack/spack/test/web.py | 165
-rw-r--r-- | lib/spack/spack/util/web.py | 89
8 files changed, 283 insertions, 35 deletions
diff --git a/lib/spack/spack/package.py b/lib/spack/spack/package.py
index 6b71c0f1a9..177b4c908b 100644
--- a/lib/spack/spack/package.py
+++ b/lib/spack/spack/package.py
@@ -570,7 +570,7 @@ class PackageBase(with_metaclass(PackageMeta, object)):
             self.list_url = None
 
         if not hasattr(self, 'list_depth'):
-            self.list_depth = 1
+            self.list_depth = 0
 
         # Set default licensing information
         if not hasattr(self, 'license_required'):
@@ -966,6 +966,10 @@ class PackageBase(with_metaclass(PackageMeta, object)):
         self.stage.expand_archive()
         self.stage.chdir_to_source()
 
+    def patch(self):
+        """Default patch implementation is a no-op."""
+        pass
+
     def do_patch(self):
         """Calls do_stage(), then applied patches to the expanded tarball
         if they haven't been applied already."""
@@ -1686,9 +1690,7 @@ class PackageBase(with_metaclass(PackageMeta, object)):
 
         try:
             return spack.util.web.find_versions_of_archive(
-                *self.all_urls,
-                list_url=self.list_url,
-                list_depth=self.list_depth)
+                self.all_urls, self.list_url, self.list_depth)
         except spack.error.NoNetworkConnectionError as e:
             tty.die("Package.fetch_versions couldn't connect to:", e.url,
                     e.message)
diff --git a/lib/spack/spack/test/data/web/1.html b/lib/spack/spack/test/data/web/1.html
new file mode 100644
index 0000000000..ef49c38cdb
--- /dev/null
+++ b/lib/spack/spack/test/data/web/1.html
@@ -0,0 +1,10 @@
+<html>
+  <head>
+    This is page 1.
+  </head>
+  <body>
+    <a href="2.html">list_depth=2 follows this.</a>
+
+    <a href="foo-1.0.0.tar.gz">foo-1.0.0.tar.gz</a>
+  </body>
+</html>
diff --git a/lib/spack/spack/test/data/web/2.html b/lib/spack/spack/test/data/web/2.html
new file mode 100644
index 0000000000..64c843f25b
--- /dev/null
+++ b/lib/spack/spack/test/data/web/2.html
@@ -0,0 +1,12 @@
+<html>
+  <head>
+    This is page 2.
+  </head>
+  <body>
+    <a href="3.html">list_depth=3 follows this.</a>
+    <a href="4.html">list_depth=3 follows this too.</a>
+
+    <a href="foo-2.0.0.tar.gz">foo-2.0.0.tar.gz</a>
+    <a href="foo-2.0.0b2.tar.gz">foo-2.0.0b2.tar.gz</a>
+  </body>
+</html>
diff --git a/lib/spack/spack/test/data/web/3.html b/lib/spack/spack/test/data/web/3.html
new file mode 100644
index 0000000000..e530206035
--- /dev/null
+++ b/lib/spack/spack/test/data/web/3.html
@@ -0,0 +1,11 @@
+<html>
+  <head>
+    This is page 3.
+  </head>
+  <body>
+    <a href="index.html">This link is already visited.</a>
+
+    <a href="foo-3.0.tar.gz">foo-3.0.tar.gz</a>
+    <a href="foo-3.0a1.tar.gz">foo-3.0a1.tar.gz</a>
+  </body>
+</html>
diff --git a/lib/spack/spack/test/data/web/4.html b/lib/spack/spack/test/data/web/4.html
new file mode 100644
index 0000000000..b5fe850f4d
--- /dev/null
+++ b/lib/spack/spack/test/data/web/4.html
@@ -0,0 +1,11 @@
+<html>
+  <head>
+    This is page 4.
+  </head>
+  <body>
+    This page is terminal and has no links to other pages.
+
+    <a href="foo-4.5.tar.gz">foo-4.5.tar.gz.</a>
+    <a href="foo-4.5-rc5.tar.gz">foo-4.1-rc5.tar.gz.</a>
+  </body>
+</html>
diff --git a/lib/spack/spack/test/data/web/index.html b/lib/spack/spack/test/data/web/index.html
new file mode 100644
index 0000000000..3985deeb35
--- /dev/null
+++ b/lib/spack/spack/test/data/web/index.html
@@ -0,0 +1,10 @@
+<html>
+  <head>
+    This is the root page.
+  </head>
+  <body>
+    <a href="1.html">list_depth=1 follows this.</a>
+
+    <a href="foo-0.0.0.tar.gz">foo-0.0.0.tar.gz</a>
+  </body>
+</html>
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
new file mode 100644
index 0000000000..9a7f4d9f8b
--- /dev/null
+++ b/lib/spack/spack/test/web.py
@@ -0,0 +1,165 @@
+##############################################################################
+# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the LICENSE file for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License (as
+# published by the Free Software Foundation) version 2.1, February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+"""Tests for web.py."""
+import pytest
+import os
+
+import spack
+from spack.util.web import spider, find_versions_of_archive
+from spack.version import *
+
+
+web_data_path = os.path.join(spack.test_path, 'data', 'web')
+
+root = 'file://' + web_data_path + '/index.html'
+root_tarball = 'file://' + web_data_path + '/foo-0.0.0.tar.gz'
+
+page_1 = 'file://' + os.path.join(web_data_path, '1.html')
+page_2 = 'file://' + os.path.join(web_data_path, '2.html')
+page_3 = 'file://' + os.path.join(web_data_path, '3.html')
+page_4 = 'file://' + os.path.join(web_data_path, '4.html')
+
+
+def test_spider_0():
+    pages, links = spider(root, depth=0)
+
+    assert root in pages
+    assert page_1 not in pages
+    assert page_2 not in pages
+    assert page_3 not in pages
+    assert page_4 not in pages
+
+    assert "This is the root page." in pages[root]
+
+    assert root not in links
+    assert page_1 in links
+    assert page_2 not in links
+    assert page_3 not in links
+    assert page_4 not in links
+
+
+def test_spider_1():
+    pages, links = spider(root, depth=1)
+
+    assert root in pages
+    assert page_1 in pages
+    assert page_2 not in pages
+    assert page_3 not in pages
+    assert page_4 not in pages
+
+    assert "This is the root page." in pages[root]
+    assert "This is page 1." in pages[page_1]
+
+    assert root not in links
+    assert page_1 in links
+    assert page_2 in links
+    assert page_3 not in links
+    assert page_4 not in links
+
+
+def test_spider_2():
+    pages, links = spider(root, depth=2)
+
+    assert root in pages
+    assert page_1 in pages
+    assert page_2 in pages
+    assert page_3 not in pages
+    assert page_4 not in pages
+
+    assert "This is the root page." in pages[root]
+    assert "This is page 1." in pages[page_1]
+    assert "This is page 2." in pages[page_2]
+
+    assert root not in links
+    assert page_1 in links
+    assert page_1 in links
+    assert page_2 in links
+    assert page_3 in links
+    assert page_4 in links
+
+
+def test_spider_3():
+    pages, links = spider(root, depth=3)
+
+    assert root in pages
+    assert page_1 in pages
+    assert page_2 in pages
+    assert page_3 in pages
+    assert page_4 in pages
+
+    assert "This is the root page." in pages[root]
+    assert "This is page 1." in pages[page_1]
+    assert "This is page 2." in pages[page_2]
+    assert "This is page 3." in pages[page_3]
+    assert "This is page 4." in pages[page_4]
+
+    assert root in links  # circular link on page 3
+    assert page_1 in links
+    assert page_1 in links
+    assert page_2 in links
+    assert page_3 in links
+    assert page_4 in links
+
+
+def test_find_versions_of_archive_0():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=0)
+    assert ver('0.0.0') in versions
+
+
+def test_find_versions_of_archive_1():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=1)
+    assert ver('0.0.0') in versions
+    assert ver('1.0.0') in versions
+
+
+def test_find_versions_of_archive_2():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=2)
+    assert ver('0.0.0') in versions
+    assert ver('1.0.0') in versions
+    assert ver('2.0.0') in versions
+
+
+@pytest.mark.xfail
+def test_find_exotic_versions_of_archive_2():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=2)
+    # up for grabs to make this better.
+    assert ver('2.0.0b2') in versions
+
+
+def test_find_versions_of_archive_3():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=3)
+    assert ver('0.0.0') in versions
+    assert ver('1.0.0') in versions
+    assert ver('2.0.0') in versions
+    assert ver('3.0') in versions
+    assert ver('4.5') in versions
+
+
+@pytest.mark.xfail
+def test_find_exotic_versions_of_archive_3():
+    versions = find_versions_of_archive(root_tarball, root, list_depth=3)
+    assert ver('2.0.0b2') in versions
+    assert ver('3.0a1') in versions
+    assert ver('4.5-rc5') in versions
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 09bf2c34e1..8e2dd34635 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -25,11 +25,12 @@
 import re
 import os
 import sys
+import traceback
 
 from six.moves.urllib.request import urlopen, Request
 from six.moves.urllib.error import URLError
 from six.moves.urllib.parse import urljoin
-from multiprocessing import Pool
+import multiprocessing.pool
 
 try:
     # Python 2 had these in the HTMLParser package.
@@ -67,25 +68,42 @@ class LinkParser(HTMLParser):
         self.links.append(val)
 
 
-def _spider(args):
-    """_spider(url, depth, max_depth)
-
-    Fetches URL and any pages it links to up to max_depth.  depth should
-    initially be 1, and max_depth includes the root.  This function will
-    print out a warning only if the root can't be fetched; it ignores
-    errors with pages that the root links to.
-
-    This will return a list of the pages fetched, in no particular order.
-
-    Takes args as a tuple b/c it's intended to be used by a multiprocessing
-    pool.  Firing off all the child links at once makes the fetch MUCH
-    faster for pages with lots of children.
-    """
-    url, visited, root, opener, depth, max_depth, raise_on_error = args
-
+class NonDaemonProcess(multiprocessing.Process):
+    """Process that allows sub-processes, so pools can have sub-pools."""
+    def _get_daemon(self):
+        return False
+
+    def _set_daemon(self, value):
+        pass
+
+    daemon = property(_get_daemon, _set_daemon)
+
+
+class NonDaemonPool(multiprocessing.pool.Pool):
+    """Pool that uses non-daemon processes"""
+    Process = NonDaemonProcess
+
+
+def _spider(url, visited, root, depth, max_depth, raise_on_error):
+    """Fetches URL and any pages it links to up to max_depth.
+
+    depth should initially be zero, and max_depth is the max depth of
+    links to follow from the root.
+
+    Prints out a warning only if the root can't be fetched; it ignores
+    errors with pages that the root links to.
+
+    Returns a tuple of:
+      - pages: dict of pages visited (URL) mapped to their full text.
+      - links: set of links encountered while visiting the pages.
+    """
     pages = {}     # dict from page URL -> text content.
     links = set()  # set of all links seen on visited pages.
 
+    # root may end with index.html -- chop that off.
+    if root.endswith('/index.html'):
+        root = re.sub('/index.html$', '', root)
+
     try:
         # Make a HEAD request first to check the content type.  This lets
         # us ignore tarballs and gigantic files.
@@ -139,17 +157,19 @@
 
             # If we're not at max depth, follow links.
             if depth < max_depth:
-                subcalls.append((abs_link, visited, root, None,
+                subcalls.append((abs_link, visited, root,
                                  depth + 1, max_depth, raise_on_error))
                 visited.add(abs_link)
 
     if subcalls:
+        pool = NonDaemonPool(processes=len(subcalls))
         try:
-            pool = Pool(processes=len(subcalls))
-            results = pool.map(_spider, subcalls)
+            results = pool.map(_spider_wrapper, subcalls)
+
             for sub_pages, sub_links in results:
                 pages.update(sub_pages)
                 links.update(sub_links)
+
         finally:
             pool.terminate()
             pool.join()
@@ -171,46 +191,53 @@
     except Exception as e:
         # Other types of errors are completely ignored, except in debug mode.
-        tty.debug("Error in _spider: %s" % e)
+        tty.debug("Error in _spider: %s:%s" % (type(e), e),
+                  traceback.format_exc())
 
     return pages, links
 
 
-def spider(root_url, **kwargs):
+def _spider_wrapper(args):
+    """Wrapper for using spider with multiprocessing."""
+    return _spider(*args)
+
+
+def spider(root_url, depth=0):
     """Gets web pages from a root URL.
-       If depth is specified (e.g., depth=2), then this will also fetches pages
-       linked from the root and its children up to depth.
+
+    If depth is specified (e.g., depth=2), then this will also follow
+    up to <depth> levels of links from the root.
 
     This will spawn processes to fetch the children, for much improved
     performance over a sequential fetch.
+
     """
-    max_depth = kwargs.setdefault('depth', 1)
-    pages, links = _spider((root_url, set(), root_url, None,
-                            1, max_depth, False))
+    pages, links = _spider(root_url, set(), root_url, 0, depth, False)
     return pages, links
 
 
-def find_versions_of_archive(*archive_urls, **kwargs):
+def find_versions_of_archive(archive_urls, list_url=None, list_depth=0):
     """Scrape web pages for new versions of a tarball.
 
     Arguments:
      archive_urls:
-          URLs for different versions of a package.  Typically these
-          are just the tarballs from the package file itself.  By
-          default, this searches the parent directories of archives.
+          URL or sequence of URLs for different versions of a
+          package.  Typically these are just the tarballs from the package
+          file itself.  By default, this searches the parent directories
+          of archives.
 
    Keyword Arguments:
      list_url:
-
          URL for a listing of archives.  Spack will scrape these
          pages for download links that look like the archive URL.
 
      list_depth:
-          Max depth to follow links on list_url pages.
+          Max depth to follow links on list_url pages.  Default 0.
+
    """
-    list_url = kwargs.get('list_url', None)
-    list_depth = kwargs.get('list_depth', 1)
+    if not isinstance(archive_urls, (list, tuple)):
+        archive_urls = [archive_urls]
 
    # Generate a list of list_urls based on archive urls and any
    # explicitly listed list_url in the package