This article walks through a simple spider (crawler) program implemented with Scrapy, shared here for your reference. The spider starts from two listing pages, follows links to the individual .html poem pages, and extracts each poem's text, title, author, and date into an item. The code is as follows:
# Standard Python library imports

# 3rd party imports
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

# My imports
from poetry_analysis.items import PoetryAnalysisItem

HTML_FILE_NAME = r'.+\.html'


class PoetryParser(object):
    """
    Provides common parsing method for poems formatted this one specific way.
    """

    date_pattern = r'(\d{2} \w{3,9} \d{4})'

    def parse_poem(self, response):
        hxs = HtmlXPathSelector(response)
        item = PoetryAnalysisItem()
        # All poetry text is in pre tags
        text = hxs.select('//pre/text()').extract()
        item['text'] = ''.join(text)
        item['url'] = response.url
        # head/title contains "title - a poem by author"
        title_text = hxs.select('//head/title/text()').extract()[0]
        item['title'], item['author'] = title_text.split(' - ')
        item['author'] = item['author'].replace('a poem by', '')
        for key in ['title', 'author']:
            item[key] = item[key].strip()
        # date_pattern is a class attribute, so it must be accessed via self
        item['date'] = hxs.select("//p[@class='small']/text()").re(self.date_pattern)
        return item


class PoetrySpider(CrawlSpider, PoetryParser):
    name = 'example.com_poetry'
    allowed_domains = ['www.example.com']
    root_path = 'someuser/poetry/'
    start_urls = ['http://www.example.com/someuser/poetry/recent/',
                  'http://www.example.com/someuser/poetry/less_recent/']

    rules = [Rule(SgmlLinkExtractor(allow=[start_urls[0] + HTML_FILE_NAME]),
                  callback='parse_poem'),
             Rule(SgmlLinkExtractor(allow=[start_urls[1] + HTML_FILE_NAME]),
                  callback='parse_poem')]
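The PoetryAnalysisItem container is imported from poetry_analysis.items, which the article does not show. A minimal sketch of what that module might look like is given below; the field names are inferred from the keys assigned in parse_poem, so treat this as an assumption rather than the original file:

# poetry_analysis/items.py -- hypothetical sketch, not part of the original article
from scrapy.item import Item, Field

class PoetryAnalysisItem(Item):
    # Fields inferred from the keys assigned in PoetryParser.parse_poem
    title = Field()
    author = Field()
    text = Field()
    url = Field()
    date = Field()

Assuming the spider sits inside a standard Scrapy project, it can be run by name, for example with scrapy crawl example.com_poetry -o poems.json to dump the scraped items to a JSON file. Note also that the scrapy.contrib imports above come from an old Scrapy release; in current versions the roughly equivalent names are scrapy.spiders.CrawlSpider, scrapy.spiders.Rule, and scrapy.linkextractors.LinkExtractor, with response.xpath() taking the place of HtmlXPathSelector.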
I hope this article is helpful for your Python programming.