一般来说,使用线程有两种模式:一种是创建线程要执行的函数,把这个函数传递进 Thread 对象里,让它来执行;另一种是直接从 Thread 继承,创建一个新的 class,把线程执行的代码放到这个新的 class 里。
本文实现了一个多线程网页爬虫:采用多线程和锁机制,按广度优先算法抓取网页。
先给大家简单介绍下我的实现思路:
对于一个网络爬虫,如果要按广度遍历的方式下载,它是这样的:
1.从给定的入口网址把第一个网页下载下来
2.从第一个网页中提取出所有新的网页地址,放入下载列表中
3.按下载列表中的地址,下载所有新的网页
4.从所有新的网页中找出没有下载过的网页地址,更新下载列表
5.重复3、4两步,直到更新后的下载列表为空表时停止
python代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
#!/usr/bin/env python
#coding=utf-8
"""Breadth-first, multi-threaded web crawler.

Starting from one entry URL, the crawler downloads every queued page (one
thread per page, ``threadnum`` pages at a time), extracts the absolute links
from the downloaded pages, and queues the links that have not been fetched
yet.  It stops when a depth level yields no new links.

Each downloaded page is saved as ``<n>.html`` in the current directory and
progress is written to ``log.txt``.  The code runs on Python 2.6+ and 3.
"""
import threading
import urllib
import re
import time

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

g_mutex = threading.Condition()  # used purely as a lock around the shared lists
g_pages = []      # downloaded page bodies waiting to have their links parsed
g_queueURL = []   # URLs waiting to be crawled at the current depth
g_existURL = []   # URLs that have already been attempted (success or failure)
g_failedURL = []  # URLs whose download failed
g_totalcount = 0  # number of downloads started so far

# NOTE(review): the original pattern assignment was missing from the source
# (the code compiled an undefined name ``reg``).  This is a reconstruction:
# any double-quoted absolute http/https URL.  Confirm against the intended one.
_URL_PATTERN = re.compile(r'"(https?://.+?)"', re.DOTALL)


class Crawler(object):
    """Drives the breadth-first crawl and owns the log file."""

    def __init__(self, crawlername, url, threadnum):
        """Store the crawl settings and open ``log.txt`` for writing.

        crawlername -- display name printed when the crawl starts
        url         -- entry URL of the crawl
        threadnum   -- maximum number of download threads started per batch
        """
        self.crawlername = crawlername
        self.url = url
        self.threadnum = threadnum
        self.threadpool = []
        self.logfile = open("log.txt", 'w')

    def close(self):
        """Close the log file; call once the crawler is no longer needed."""
        self.logfile.close()

    def craw(self):
        """Crawl level by level until no new URLs are discovered."""
        global g_queueURL
        # Use the instance's own entry URL rather than a module-level global,
        # so the class also works when imported instead of run as a script.
        g_queueURL.append(self.url)
        depth = 0
        print(self.crawlername + " 启动...")
        while g_queueURL:
            depth += 1
            # Spacing reproduces the output of the original comma-separated
            # Python 2 print statement.
            print('Searching depth  %d ...\n\n' % depth)
            self.logfile.write("URL:" + g_queueURL[0] + "........")
            self.downloadAll()
            self.updateQueueURL()
            self.logfile.write('\n>>>Depth ' + str(depth) + ':\n')
            for index, link in enumerate(g_queueURL):
                self.logfile.write(
                    str(g_totalcount + index) + '->' + link + '\n')

    def downloadAll(self):
        """Download every queued URL in batches of ``threadnum`` threads.

        Each batch is joined (30 s per thread) before the next one starts.
        The queue is emptied afterwards.
        """
        global g_queueURL
        global g_totalcount
        # A non-positive thread count would never advance through the queue.
        batch_size = max(1, self.threadnum)
        for start in range(0, len(g_queueURL), batch_size):
            batch = g_queueURL[start:start + batch_size]
            for offset, link in enumerate(batch):
                g_totalcount += 1
                worker = self.download(
                    link, str(g_totalcount) + '.html', offset)
                if worker is not None:
                    print('Thread started: %d --File number = %d'
                          % (start + offset, g_totalcount))
            for worker in self.threadpool:
                worker.join(30)
            # Reset the instance pool (not a local) so finished threads are
            # not kept and re-joined on every later batch.
            self.threadpool = []
        g_queueURL = []

    def download(self, url, filename, tid):
        """Start a thread that saves ``url`` to ``filename``; return the thread."""
        crawthread = CrawlerThread(url, filename, tid)
        self.threadpool.append(crawthread)
        crawthread.start()
        return crawthread

    def updateQueueURL(self):
        """Rebuild the queue from links in the newly downloaded pages.

        Pages are removed from ``g_pages`` once taken for parsing, so each
        page is parsed only once and its body can be garbage-collected.
        """
        global g_queueURL
        with g_mutex:
            # Snapshot under the lock: a download thread that outlived its
            # join timeout may still be appending.
            pages = list(g_pages)
            del g_pages[:]
            attempted = set(g_existURL)
        found = set()
        for content in pages:
            found.update(self.getUrl(content))
        g_queueURL = list(found - attempted)

    def getUrl(self, content):
        """Return all double-quoted absolute URLs found in ``content``."""
        return _URL_PATTERN.findall(content)


class CrawlerThread(threading.Thread):
    """Thread that downloads one URL and saves the response to a file."""

    def __init__(self, url, filename, tid):
        """Remember the URL to fetch, the output file name and a thread id."""
        threading.Thread.__init__(self)
        self.url = url
        self.filename = filename
        self.tid = tid

    def run(self):
        """Fetch the page, save it, and record the outcome in the globals.

        On any failure the URL is recorded in both ``g_existURL`` and
        ``g_failedURL`` and the error is printed; the exception does not
        propagate out of the thread.
        """
        try:
            page = urlopen(self.url)
            try:
                html = page.read()
            finally:
                page.close()
            # Binary mode stores the response bytes unchanged on every
            # platform and Python version.
            with open(self.filename, 'wb') as fout:
                fout.write(html)
        except Exception as e:
            # Deliberately broad: one bad page must not kill the crawl.
            with g_mutex:
                g_existURL.append(self.url)
                g_failedURL.append(self.url)
            print('Failed downloading and saving %s' % (self.url,))
            print(e)
            return None
        if not isinstance(html, str):
            # Python 3 returns bytes; the link pattern is a text pattern.
            html = html.decode('utf-8', 'replace')
        with g_mutex:
            g_pages.append(html)
            g_existURL.append(self.url)


if __name__ == "__main__":
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    url = read_line("请输入url入口:\n")
    threadnum = int(read_line("设置线程数:"))
    crawlername = "小小爬虫"
    crawler = Crawler(crawlername, url, threadnum)
    try:
        crawler.craw()
    finally:
        crawler.close()
以上代码就是给大家分享的基于 Python 实现的多线程网页爬虫,希望大家喜欢。