File structure
html_downloader.py - downloads the HTML content of a page
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
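If you want to smoke-test the downloader on its own, a minimal sketch along these lines should work (my own example, not part of the original code; it assumes Python 2, since the module depends on urllib2, and reuses the tag-page URL from spider_main.py):

# Minimal smoke test for HtmlDownloader (assumes Python 2 / urllib2).
from html_downloader import HtmlDownloader

downloader = HtmlDownloader()
html = downloader.download('https://movie.douban.com/tag/2017?start=100&type=T')
if html is None:
    print 'download failed (bad URL or non-200 response)'
else:
    print 'downloaded %d bytes' % len(html)

Note that Douban may reject requests that lack a browser-like User-Agent; if that happens you would have to build a urllib2.Request with custom headers instead of passing the bare URL to urlopen.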
html_outputer.py - writes the results to a file
#!/usr/bin/python
# -*- coding: UTF-8 -*-


class HtmlOutputer(object):
    def collect_data(self, movie_data):
        if movie_data is None:
            return
        fout = open('output.html', 'a+')
        for data in movie_data:
            print data['name'] + '|', data['rate'] + '|', data['actor'], '\n'
            fout.write('%s,' % data['name'].encode('utf-8'))
            fout.write('%s,' % data['rate'])
            fout.write('%s\n' % data['actor'].encode('utf-8'))
        fout.close()
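collect_data expects a list of dicts with 'name', 'rate' and 'actor' keys (the shape produced by HtmlParser.get_movie_name below), so it can be exercised with hand-made data; a rough sketch using made-up values:

# Hypothetical sample data; the real dicts come from HtmlParser.get_movie_name().
from html_outputer import HtmlOutputer

outputer = HtmlOutputer()
sample = [
    {'name': u'Some Movie / 2017', 'rate': '8.5', 'actor': u'Director / Lead Actor'},
]
outputer.collect_data(sample)  # appends one comma-separated line to output.html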
html_parser.py - the parser: parses the HTML DOM tree
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup


class HtmlParser(object):
    def __init__(self):
        pass

    def parser_html(self, cnt):
        if cnt is None:
            return
        soup = BeautifulSoup(cnt, 'html.parser', from_encoding='utf-8')
        # movie_name, movie_desc, movie_rate =
        return self.get_movie_names(soup)

    def get_movie_names(self, soup):
        movie_data = []
        movie_all = soup.find('div', class_='article').find_next('table') \
            .find_next_sibling('div').find_next_sibling('div').find_all('table')
        count = 1
        for movie_one in movie_all:
            movie_data.append(self.get_movie_name(movie_one))
            # if count > 2:
            #     break
            count += 1
        return movie_data

    def get_movie_name(self, cnt):
        info = {}
        soup = BeautifulSoup(str(cnt), 'html.parser', from_encoding='utf-8')
        movie_one = soup.find('tr', class_='item').find_next('td') \
            .find_next_sibling('td').find('div', class_='pl2')
        info['name'] = movie_one.find('a').get_text().replace("\n", " ").replace(" ", " ")
        info['actor'] = movie_one.find('p', class_='pl').get_text().replace("\n", " ").replace(" ", " ")
        info['rate'] = movie_one.find('div', class_='star clearfix').find('span', class_='rating_nums').get_text()
        return info
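The chained find / find_next / find_next_sibling calls above are tied to the exact layout of Douban's tag pages, so the simplest way to try the parser is to feed it a page fetched by the downloader and inspect the dicts it returns; a minimal sketch (my own, assuming the modules sit in the same directory):

from html_downloader import HtmlDownloader
from html_parser import HtmlParser

cnt = HtmlDownloader().download('https://movie.douban.com/tag/2017?start=100&type=T')
for movie in HtmlParser().parser_html(cnt) or []:  # parser_html returns None on empty input
    print movie['name'], movie['rate'], movie['actor']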
spider_main.py - the main entry point
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import html_parser, html_outputer, html_downloader


class SpiderMain(object):
    def __init__(self):
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
        self.downloader = html_downloader.HtmlDownloader()

    def craw(self, url):
        html_cnt = self.downloader.download(url)
        movie_data = self.parser.parser_html(html_cnt)
        self.outputer.collect_data(movie_data)


if __name__ == '__main__':
    url = 'https://movie.douban.com/tag/2017?start=100&type=T'
    spider = SpiderMain()
    spider.craw(url)
Summary
In essence this just uses the urllib2 and BeautifulSoup libraries, so there is not much more to say. You can change the url directly and edit html_parser.py to fit your own crawling needs. You can also edit html_outputer.py to define the output format; right now it writes comma-separated values (into a file named output.html).
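For example, if you would rather have a real .csv file than the comma-joined lines currently written to output.html, one possible replacement for collect_data (my own sketch, not part of the original code) could use Python 2's csv module:

# A possible alternative collect_data() for html_outputer.py (Python 2 sketch).
import csv

def collect_data(self, movie_data):
    if movie_data is None:
        return
    # 'ab' = binary append mode, which is what the Python 2 csv module expects
    with open('output.csv', 'ab') as fout:
        writer = csv.writer(fout)
        for data in movie_data:
            writer.writerow([data['name'].encode('utf-8'),
                             data['rate'],
                             data['actor'].encode('utf-8')])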