本文实例讲述了Python实现在线程里运行scrapy的方法。分享给大家供大家参考。具体如下:
如果你希望在一个已经写好的程序里调用 Scrapy,可以通过下面的代码让 Scrapy 运行在一个单独的线程里(代码适用于 Scrapy 0.8,使用 Python 2)。
""" Code to run Scrapy crawler in a thread - works on Scrapy 0.8 """ import threading, Queue from twisted.internet import reactor from scrapy.xlib.pydispatch import dispatcher from scrapy.core.manager import scrapymanager from scrapy.core.engine import scrapyengine from scrapy.core import signals class CrawlerThread(threading.Thread): def __init__( self ): threading.Thread.__init__( self ) self .running = False def run( self ): self .running = True scrapymanager.configure(control_reactor = False ) scrapymanager.start() reactor.run(installSignalHandlers = False ) def crawl( self , * args): if not self .running: raise RuntimeError( "CrawlerThread not running" ) self ._call_and_block_until_signal(signals.spider_closed, \ scrapymanager.crawl, * args) def stop( self ): reactor.callFromThread(scrapyengine.stop) def _call_and_block_until_signal( self , signal, f, * a, * * kw): q = Queue.Queue() def unblock(): q.put( None ) dispatcher.connect(unblock, signal = signal) reactor.callFromThread(f, * a, * * kw) q.get() # Usage example below: import os os.environ.setdefault( 'SCRAPY_SETTINGS_MODULE' , 'myproject.settings' ) from scrapy.xlib.pydispatch import dispatcher from scrapy.core import signals from scrapy.conf import settings from scrapy.crawler import CrawlerThread settings.overrides[ 'LOG_ENABLED' ] = False # avoid log noise def item_passed(item): print "Just scraped item:" , item dispatcher.connect(item_passed, signal = signals.item_passed) crawler = CrawlerThread() print "Starting crawler thread..." crawler.start() print "Crawling somedomain.com...." crawler.crawl('somedomain.com) # blocking call print "Crawling anotherdomain.com..." crawler.crawl( 'anotherdomain.com' ) # blocking call print "Stopping crawler thread..." crawler.stop() |
希望本文所述对大家的Python程序设计有所帮助。