I have recently been learning Python, which naturally led me to web crawling, so I wrote a small crawler program. Starting from an initial URL it downloads and parses each page, uses a regular expression to pick out the links to crawl next, uses BeautifulSoup to extract the page text, and saves that text with a small outputter I wrote myself. The full code is below:
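The main script imports its helpers from a baike_spider package, so the files are assumed to sit in a layout roughly like the one below (the package name comes from the import statement; the rest of the layout is my assumption):

baike_spider/
    __init__.py
    url_manager.py
    html_downloader.py
    html_parser.py
    html_outputer.py
spider_main.py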
Spider_main.py
# coding:utf8
from baike_spider import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            print("self.urls.has %s" % self.urls.new_urls)
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                # Stop after 1000 pages at most.
                if count == 1000:
                    break
                count = count + 1
            except Exception as e:
                print("craw failed: %s" % e)
        self.outputer.output_html()
        self.outputer.output_txt()


if __name__ == '__main__':
    root_url = "http://www.shushu8.com/jiangnan/longzu2qianzhuan/1"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
url_manager.py
class UrlManager(object):
    """Keeps two sets: URLs still waiting to be crawled and URLs already crawled."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        print(url)
        if url is None:
            return
        # Only queue a URL that has not been seen before.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        # print('new url is %s' % new_url)
        return new_url

    def add_new_urls(self, urls):
        print("add_new_urls %s" % urls)
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)
            print(url)
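As a quick illustration of how the manager deduplicates, here is a minimal usage sketch (the example URLs are made up):

um = UrlManager()
um.add_new_url("http://example.com/page1")
um.add_new_urls({"http://example.com/page1", "http://example.com/page2"})  # page1 is already queued, so it is ignored
while um.has_new_url():
    print(um.get_new_url())  # each URL comes out once and is then marked as old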
html_parser.py
import re
import urllib.parse

from bs4 import BeautifulSoup


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        print("parse new_urls %s" % new_urls)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_data(self, page_url, soup):
        # Collect the chapter title and the body text of the current page.
        res_data = {}
        res_data['url'] = page_url
        print(page_url)
        title_node = soup.find(class_="title").find("h1")
        print(title_node.get_text())
        res_data['title'] = title_node.get_text()
        print("_get_new_data")
        summary_node = soup.find('pre')
        print(summary_node.get_text())
        res_data['summary'] = summary_node.get_text()
        return res_data

    def _get_new_urls(self, page_url, soup):
        # Collect every link whose href matches /jiangnan/ and resolve it against the current page.
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/jiangnan/"))
        print(links)
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
            # print(new_full_url)
        return new_urls
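To see what parse() returns, here is a minimal sketch run against a made-up HTML snippet shaped like the target pages; the snippet, its text, and the import path are illustrative assumptions, not real site data:

from baike_spider.html_parser import HtmlParser  # assumes the module above lives in the baike_spider package

sample = b'''
<div class="title"><h1>Chapter 1</h1></div>
<a href="/jiangnan/longzu2qianzhuan/2">next chapter</a>
<pre>Chapter body text goes here.</pre>
'''
parser = HtmlParser()
new_urls, new_data = parser.parse("http://www.shushu8.com/jiangnan/longzu2qianzhuan/1", sample)
print(new_urls)             # {'http://www.shushu8.com/jiangnan/longzu2qianzhuan/2'}
print(new_data['title'])    # Chapter 1
print(new_data['summary'])  # Chapter body text goes here.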
html_downloader.py
import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
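Some sites reject requests that carry urllib's default User-Agent, and urlopen() can block for a long time when no timeout is set. A possible variant of the downloader, not part of the original code and with an example header value, would be:

import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # Sketch: send a browser-like User-Agent (example value) and give up after 10 seconds.
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        response = urllib.request.urlopen(request, timeout=10)
        if response.getcode() != 200:
            return None
        return response.read()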
html_outputer.py
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_txt(self):
        # Write title and body of every collected page to a plain-text file.
        fout = open('output.txt', 'w', encoding='utf-8')
        for data in self.datas:
            fout.write('%s\n' % data['title'])
            fout.write('%s\n' % data['summary'])
        fout.close()

    def output_html(self):
        # Write the collected pages as rows of an HTML table.
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
Summary
The above is the example code for a Python crawler that downloads documents from the web. I hope it is helpful. If you have any questions, please leave me a comment and I will reply promptly. Many thanks as well for your support of 服务器之家!
原文链接:https://www.cnblogs.com/hasan/archive/2018/06/12/9175592.html