最近在学习爬虫的过程中,发现了一个开源的爬虫框架pyspider。 pyspider支持可视化创建和调试爬虫脚本。 后台启动pyspider进程后,可以在浏览器中输入http://127.0.0.1:5000 来在线访问测试页面。 pyspider支持单步运行脚本,可以方便地从页面提取元素的CSS选择器。 看起来挺强大。 不过我在刚安装时就遇到了几个大坑。 我的环境是:CentOS6.5 x86_64、Python3.7.0。 通过pip install pyspider安装之后,运行pyspider,会报如下错误: Traceback (most recent call last): File "/usr/local/bin/pyspider", line 5, in <module> from pyspider.run import main File "/usr/local/lib/python3.7/site-packages/pyspider/run.py", line 231 async=True, get_object=False, no_input=False): ^ SyntaxError: invalid syntax 查看报错文件/pyspider/run.py,发现其中定义了一个fetcher函数: def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent, timeout, phantomjs_endpoint, splash_endpoint, fetcher_cls, async=True, get_object=False, no_input=False): 该函数使用的一个参数名为async。 async是Python3.7中的一个关键字,难怪报错了。 将run.py中的async全部替换为_async: sed -i 's/async/_async/g' /home/test/venv_pyspider/lib/python3.7/site-packages/pyspider/run.py 重新运行pyspider,又一个报错: Traceback (most recent call last): File "/home/test/venv_pyspider/bin/pyspider", line 8, in <module> sys.exit(main()) File "/home/test/venv_pyspider/lib/python3.7/site-packages/pyspider/run.py", line 754, in main cli() File "/home/test/venv_pyspider/lib/python3.7/site-packages/click/core.py", line 829, in __call__ return self.main(*args, **kwargs) File "/home/test/venv_pyspider/lib/python3.7/site-packages/click/core.py", line 782, in main rv = self.invoke(ctx) File "/home/test/venv_pyspider/lib/python3.7/site-packages/click/core.py", line 1236, in invoke return Command.invoke(self, ctx) File "/home/test/venv_pyspider/lib/python3.7/site-packages/click/core.py", line 1066, in invoke return ctx.invoke(self.callback, **ctx.params) File "/home/test/venv_pyspider/lib/python3.7/site-packages/click/core.py", line 610, in invoke return callback(*args, **kwargs) File "/home/test/venv_pyspider/lib/python3.7/site-packages/click/decorators.py", line 21, in new_func return f(get_current_context(), *args, **kwargs) File 
"/home/test/venv_pyspider/lib/python3.7/site-packages/pyspider/run.py", line 165, in cli ctx.invoke(all) File "/home/test/venv_pyspider/lib/python3.7/site-packages/click/core.py", line 610, in invoke return callback(*args, **kwargs) File "/home/test/venv_pyspider/lib/python3.7/site-packages/click/decorators.py", line 21, in new_func return f(get_current_context(), *args, **kwargs) File "/home/test/venv_pyspider/lib/python3.7/site-packages/pyspider/run.py", line 497, in all ctx.invoke(webui, **webui_config) File "/home/test/venv_pyspider/lib/python3.7/site-packages/click/core.py", line 610, in invoke return callback(*args, **kwargs) File "/home/test/venv_pyspider/lib/python3.7/site-packages/click/decorators.py", line 21, in new_func return f(get_current_context(), *args, **kwargs) File "/home/test/venv_pyspider/lib/python3.7/site-packages/pyspider/run.py", line 333, in webui app = load_cls(None, None, webui_instance) File "/home/test/venv_pyspider/lib/python3.7/site-packages/pyspider/run.py", line 48, in load_cls return utils.load_object(value) File "/home/test/venv_pyspider/lib/python3.7/site-packages/pyspider/libs/utils.py", line 369, in load_object module = __import__(module_name, globals(), locals(), [object_name]) File "/home/test/venv_pyspider/lib/python3.7/site-packages/pyspider/webui/__init__.py", line 8, in <module> from . 
import app, index, debug, task, result, login File "/home/test/venv_pyspider/lib/python3.7/site-packages/pyspider/webui/app.py", line 95 'fetch': lambda x: tornado_fetcher.Fetcher(None, None, async=False).fetch(x), 看最后一行,原来是/pyspider/webui/app.py中创建tornado_fetcher.Fetcher对象时传入了async参数。 嗯,还得把这个地方的async修改一下,改为_async。 再次运行pyspider,还有错误: Traceback (most recent call last): File "/usr/local/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap self.run() File "/usr/local/lib/python3.7/multiprocessing/process.py", line 99, in run self._target(*self._args, **self._kwargs) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 610, in invoke return callback(*args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/click/decorators.py", line 21, in new_func return f(get_current_context(), *args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/pyspider/run.py", line 236, in fetcher Fetcher = load_cls(None, None, fetcher_cls) File "/home/test/venv/lib/python3.7/site-packages/pyspider/run.py", line 48, in load_cls return utils.load_object(value) File "/home/test/venv/lib/python3.7/site-packages/pyspider/libs/utils.py", line 369, in load_object module = __import__(module_name, globals(), locals(), [object_name]) File "/home/test/venv/lib/python3.7/site-packages/pyspider/fetcher/__init__.py", line 1, in <module> from .tornado_fetcher import Fetcher File "/home/test/venv/lib/python3.7/site-packages/pyspider/fetcher/tornado_fetcher.py", line 81 def __init__(self, inqueue, outqueue, poolsize=100, proxy=None, async=True): 这次是/pyspider/fetcher/tornado_fetcher.py,里面也使用了async作为函数参数。同样将async参数名改为_async。 再运行pyspider,几秒后又发生报错: Traceback (most recent call last): File "/home/test/venv/bin/pyspider", line 8, in <module> sys.exit(main()) File "/home/test/venv/lib/python3.7/site-packages/pyspider/run.py", line 754, in main cli() File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 829, in __call__ return self.main(*args, **kwargs) File 
"/home/test/venv/lib/python3.7/site-packages/click/core.py", line 782, in main rv = self.invoke(ctx) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 1236, in invoke return Command.invoke(self, ctx) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 1066, in invoke return ctx.invoke(self.callback, **ctx.params) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 610, in invoke return callback(*args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/click/decorators.py", line 21, in new_func return f(get_current_context(), *args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/pyspider/run.py", line 165, in cli ctx.invoke(all) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 610, in invoke return callback(*args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/click/decorators.py", line 21, in new_func return f(get_current_context(), *args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/pyspider/run.py", line 497, in all ctx.invoke(webui, **webui_config) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 610, in invoke return callback(*args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/click/decorators.py", line 21, in new_func return f(get_current_context(), *args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/pyspider/run.py", line 384, in webui app.run(host=host, port=port) File "/home/test/venv/lib/python3.7/site-packages/pyspider/webui/app.py", line 59, in run from .webdav import dav_app File "/home/test/venv/lib/python3.7/site-packages/pyspider/webui/webdav.py", line 216, in <module> dav_app = WsgiDAVApp(config) File "/home/test/venv/lib/python3.7/site-packages/wsgidav/wsgidav_app.py", line 134, in __init__ _check_config(config) File "/home/test/venv/lib/python3.7/site-packages/wsgidav/wsgidav_app.py", line 118, in _check_config raise ValueError("Invalid configuration:\n - " + "\n - 
".join(errors)) ValueError: Invalid configuration: - Deprecated option 'domaincontroller': use 'http_authenticator.domain_controller' instead. 这个提示说使用了非法配置,domaincontroller配置项已被废弃,应使用http_authenticator.domain_controller。 好吧,再改。 我们打开/home/test/venv/lib/python3.7/site-packages/pyspider/webui/webdav.py这个文件, 找到其中的domaincontroller,将其替换为http_authenticator.domain_controller。 重新运行pyspider,又报错了,疯不疯? Traceback (most recent call last): File "/home/test/venv/bin/pyspider", line 8, in <module> sys.exit(main()) File "/home/test/venv/lib/python3.7/site-packages/pyspider/run.py", line 754, in main cli() File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 829, in __call__ return self.main(*args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 782, in main rv = self.invoke(ctx) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 1236, in invoke return Command.invoke(self, ctx) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 1066, in invoke return ctx.invoke(self.callback, **ctx.params) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 610, in invoke return callback(*args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/click/decorators.py", line 21, in new_func return f(get_current_context(), *args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/pyspider/run.py", line 165, in cli ctx.invoke(all) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 610, in invoke return callback(*args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/click/decorators.py", line 21, in new_func return f(get_current_context(), *args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/pyspider/run.py", line 497, in all ctx.invoke(webui, **webui_config) File "/home/test/venv/lib/python3.7/site-packages/click/core.py", line 610, in invoke return callback(*args, **kwargs) File 
"/home/test/venv/lib/python3.7/site-packages/click/decorators.py", line 21, in new_func return f(get_current_context(), *args, **kwargs) File "/home/test/venv/lib/python3.7/site-packages/pyspider/run.py", line 384, in webui app.run(host=host, port=port) File "/home/test/venv/lib/python3.7/site-packages/pyspider/webui/app.py", line 64, in run from werkzeug.wsgi import DispatcherMiddleware ImportError: cannot import name 'DispatcherMiddleware' from 'werkzeug.wsgi' (/home/test/venv/lib/python3.7/site-packages/werkzeug/wsgi.py) 最后一行说从werkzeug.wsgi导入DispatcherMiddleware错误。 从网上查找这个报错,原来是pyspider使用的werkzeug版本过高导致, 需要使用0.16.1版本。 pip uninstall werkzeug -y pip install werkzeug==0.16.1 最后一次运行,终于跑起来了。 使用开源的东西不容易啊,希望作者能早日修复这些bug。 |
|
来自: RealPython > 《python 技术》