Analysis
Use CrawlSpider together with LinkExtractor and Rule to crawl the page data.
LinkExtractor defines the link-extraction rules; in most cases the allow parameter is all you need. Its constructor parameters are listed below, followed by a short usage sketch.
LinkExtractor(allow=(),            # regex patterns the link URLs must match
              deny=(),             # regex patterns to exclude
              allow_domains=(),    # only extract links in these domains
              deny_domains=(),     # never extract links in these domains
              restrict_xpaths=(),  # XPath expressions that limit where links are extracted from
              tags=('a', 'area'),
              attrs=('href',),
              canonicalize=False,
              unique=True,
              process_value=None,
              deny_extensions=None,
              restrict_css=(),     # CSS selectors that limit where links are extracted from
              strip=True)
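As a quick illustration (a minimal sketch, not part of the original project), a LinkExtractor built with only allow can be applied to the response inside any spider callback to see which links it would match:

from scrapy.linkextractors import LinkExtractor

# inside any spider callback that has a `response` object:
link_extractor = LinkExtractor(allow=(r'page=\d+',))
for link in link_extractor.extract_links(response):
    # extract_links() returns Link objects carrying the matched URL and anchor text
    print(link.url, link.text)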
Rule defines a crawling rule for CrawlSpider. The spider applies it automatically: it submits the requests, fetches the responses, and hands each response to the method named by callback.
If a callback is given, follow defaults to False; if callback is None, follow defaults to True.
Rule(link_extractor,           # a LinkExtractor object, required
     callback=None,            # callback method, optional
     cb_kwargs=None,
     follow=None,              # whether to keep following links from matched pages (True/False)
     process_links=None,       # hook for post-processing extracted links (some anti-crawling tricks return fake URLs)
     process_request=identity)
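To make the follow defaults concrete, here are two illustrative rules (the job-detail pattern is hypothetical and not from the original spider):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

rules = [
    # callback given, follow omitted -> follow defaults to False:
    # matched pages are handed to parse_page, but links on them are not extracted again
    Rule(LinkExtractor(allow=(r'page=\d+',)), callback='parse_page'),
    # no callback -> follow defaults to True:
    # the spider keeps following matching links without calling any parse method for them
    Rule(LinkExtractor(allow=(r'/job_detail/',))),
]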
Source code
items.py
import scrapy


class BosszhipinItem(scrapy.Item):
    """Item for the BOSS Zhipin Python job spider"""
    # job title
    position = scrapy.Field()
    # company name
    company = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # work location
    location = scrapy.Field()
    # education requirement
    education = scrapy.Field()
    # years of experience required
    year = scrapy.Field()
spiders/bosszhipin_spider.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from myscrapy.items import BosszhipinItem


class BosszhipinSpider(CrawlSpider):
    """
    BOSS Zhipin Python job spider,
    implemented on the CrawlSpider base class
    """
    name = 'bosszhipin'
    allowed_domains = ['zhipin.com']
    start_urls = ['http://www.zhipin.com/c100010000/h_100010000/?query=python&page=1']

    # link extractor object (defines the link-extraction rule)
    link_extractor = LinkExtractor(allow=(r'page=\d+',))

    # list of crawling rule objects
    # the spider automatically calls the method named by callback for every URL
    # matched by link_extractor
    # under the hood: link_extractor.extract_links(response) returns the matched links
    rules = [
        Rule(link_extractor, callback='parse_page', follow=True),
    ]

    def parse_page(self, response):
        """Callback that parses each response object"""
        job_list = response.xpath('//div[@class="job-list"]//li')
        for job in job_list:
            position = job.xpath('.//div[@class="info-primary"]//h3[@class="name"]/a/text()')[0].extract()
            salary = job.xpath('.//div[@class="info-primary"]//h3[@class="name"]//span/text()')[0].extract()
            company = job.xpath('.//div[@class="company-text"]//a/text()')[0].extract()
            location = job.xpath('.//div[@class="info-primary"]/p/text()[1]')[0].extract()
            year = job.xpath('.//div[@class="info-primary"]/p/text()[2]')[0].extract()
            education = job.xpath('.//div[@class="info-primary"]/p/text()[3]')[0].extract()

            item = BosszhipinItem()
            item['position'] = position
            item['salary'] = salary
            item['company'] = company
            item['location'] = location
            item['year'] = year
            item['education'] = education
            yield item
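Before running the full crawl, the XPath expressions above can be sanity-checked in Scrapy's interactive shell (my suggestion, not part of the original post; zhipin.com uses anti-crawling measures, so the page may come back empty without a browser-like User-Agent):

# from the project directory:
#   scrapy shell "http://www.zhipin.com/c100010000/h_100010000/?query=python&page=1"
# inside the shell, `response` is predefined, so the selectors can be tested directly:
jobs = response.xpath('//div[@class="job-list"]//li')
print(len(jobs))  # 0 usually means the request was blocked or the markup has changed
for job in jobs[:3]:
    print(job.xpath('.//div[@class="info-primary"]//h3[@class="name"]/a/text()').extract_first())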
pipelines.py
import json


class BosszhipinPipeline(object):
    """Item pipeline for the BOSS Zhipin Python job spider"""
    def __init__(self):
        # the data/ directory must already exist
        self.f = open('data/bosszhipin.json', mode='wb')
        self.f.write(b'[')
        self.first_item = True

    def process_item(self, item, spider):
        # write a comma separator before every item except the first,
        # so the finished file is valid JSON
        if not self.first_item:
            self.f.write(b',')
        self.first_item = False
        data = json.dumps(dict(item), ensure_ascii=False, indent=4)
        self.f.write(data.encode('utf-8'))
        return item

    def close_spider(self, spider):
        self.f.write(b']')
        self.f.close()
settings.py
ITEM_PIPELINES = {
    'myscrapy.pipelines.BosszhipinPipeline': 1,
}
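With the pipeline enabled, the crawl can be started from the project root (assuming the project is named myscrapy, as in the spider's import) with:

    scrapy crawl bosszhipin

The data/ directory must exist beforehand; the scraped jobs end up in data/bosszhipin.json.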
Results
Summary
That is everything in this article. I hope it offers some reference value for your study or work, and thank you for supporting 服务器之家. If you want to learn more, please check the related links below.
Original article: https://blog.csdn.net/topleeyap/article/details/78907149