分析
需求:
爬取西刺代理网免费高匿代理,并保存到mysql数据库中。
这里只爬取前10页中的数据。
思路:
- 分析网页结构,确定数据提取规则
- 创建scrapy项目
- 编写item,定义数据字段
- 编写spider,实现数据抓取
- 编写pipeline,保存数据到数据库中
- 配置settings.py文件
- 运行爬虫项目
代码实现
items.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
|
import scrapy


class xicidailiitem(scrapy.Item):
    """Item holding one free high-anonymity proxy record scraped
    from xicidaili.com.  All fields are stored as plain strings,
    matching the varchar columns of the MySQL table the pipeline
    writes to.
    """
    # country (taken from the flag image's alt attribute)
    country = scrapy.Field()
    # IP address
    ip = scrapy.Field()
    # port number
    port = scrapy.Field()
    # server location
    address = scrapy.Field()
    # anonymity level
    anonymous = scrapy.Field()
    # proxy type (HTTP / HTTPS)
    type = scrapy.Field()
    # speed (seconds, parsed from the title attribute)
    speed = scrapy.Field()
    # connect time (seconds, parsed from the title attribute)
    connect_time = scrapy.Field()
    # alive time
    alive_time = scrapy.Field()
    # last verification time
    verify_time = scrapy.Field()
xicidaili_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy

from myscrapy.items import xicidailiitem


class xicidailispider(scrapy.Spider):
    """Spider scraping the first 10 pages of free high-anonymity
    proxies ("nn" listing) from xicidaili.com, yielding one
    xicidailiitem per table row.
    """

    name = 'xicidaili'
    allowed_domains = ['www.xicidaili.com']
    # start_urls is not used; start_requests generates pages 1-10.

    def start_requests(self):
        for page in range(1, 11):
            url = 'http://www.xicidaili.com/nn/' + str(page)
            yield scrapy.Request(url, callback=self.parse, method='GET')

    def parse(self, response):
        # Proxy rows live in the table with id "ip_list"; the first
        # <tr> is the header row, so skip it.
        tr_list = response.xpath('//table[@id="ip_list"]/tr')
        for tr in tr_list[1:]:
            item = xicidailiitem()
            item['country'] = tr.xpath('./td[1]/img/@alt').extract_first()
            item['ip'] = tr.xpath('./td[2]/text()').extract_first()
            item['port'] = tr.xpath('./td[3]/text()').extract_first()
            item['address'] = tr.xpath('./td[4]/a/text()').extract_first()
            item['anonymous'] = tr.xpath('./td[5]/text()').extract_first()
            item['type'] = tr.xpath('./td[6]/text()').extract_first()
            # Speed / connect time are encoded in the div's title
            # attribute; re_first returns None on a non-matching row
            # instead of raising IndexError like .re(...)[0] would.
            item['speed'] = tr.xpath('./td[7]/div/@title').re_first(r'\d{1,3}\.\d{0,}')
            item['connect_time'] = tr.xpath('./td[8]/div/@title').re_first(r'\d{1,3}\.\d{0,}')
            item['alive_time'] = tr.xpath('./td[9]/text()').extract_first()
            item['verify_time'] = tr.xpath('./td[10]/text()').extract_first()
            yield item
pipelines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
import pymysql


class xicidailipipeline(object):
    """Item pipeline that stores scraped proxy records into MySQL.

    Expected table schema:

        create table xicidaili(
            id int primary key auto_increment,
            country varchar(10) not null,
            ip varchar(30) not null,
            port varchar(10) not null,
            address varchar(30) not null,
            anonymous varchar(10) not null,
            type varchar(20) not null,
            speed varchar(10) not null,
            connect_time varchar(20) not null,
            alive_time varchar(20) not null,
            verify_time varchar(20) not null);
    """

    def __init__(self):
        # NOTE: MySQL expects the charset spelled 'utf8', not 'utf-8'.
        self.connection = pymysql.connect(host='localhost',
                                          user='root',
                                          password='123456',
                                          db='mydb',
                                          charset='utf8',
                                          cursorclass=pymysql.cursors.DictCursor)

    def process_item(self, item, spider):
        """Insert one proxy record; the parameterized query lets
        pymysql escape every value (no string-built SQL)."""
        with self.connection.cursor() as cursor:
            sql = 'insert into xicidaili' \
                  '(country,ip,port,address,anonymous,type,speed,connect_time,alive_time,verify_time) values' \
                  '(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);'
            args = (item['country'], item['ip'], item['port'], item['address'],
                    item['anonymous'], item['type'], item['speed'],
                    item['connect_time'], item['alive_time'], item['verify_time'])
            spider.logger.info(args)
            cursor.execute(sql, args)
        self.connection.commit()
        # Scrapy pipelines must return the item so later pipelines
        # (and the default item log) still see it.
        return item

    def close_spider(self, spider):
        # Release the MySQL connection when the spider finishes.
        self.connection.close()
settings.py
1
2
3
|
# Scrapy only reads UPPERCASE setting names, so this must be
# ITEM_PIPELINES (lowercase item_pipelines is silently ignored and
# the pipeline would never run).  300 is the pipeline's priority.
ITEM_PIPELINES = {
    'myscrapy.pipelines.xicidailipipeline': 300,
}
结果
总结
以上就是这篇文章的全部内容了,希望本文的内容对大家的学习或者工作具有一定的参考学习价值,谢谢大家对服务器之家的支持。如果你想了解更多相关内容请查看下面相关链接
原文链接:https://blog.csdn.net/topleeyap/article/details/79145147