最基本的抓取网页内容的代码实现:
#!/usr/bin/env python
"""Download a web page and print its first and last non-blank lines.

Written for Python 3, like the other snippets in this file: the original
``urllib.urlretrieve`` import and ``print`` statements only ran on Python 2.
"""

import sys
from urllib.request import urlretrieve


def firstNonBlank(lines):
    """Return the first item of *lines* that is not empty or whitespace-only.

    Args:
        lines: any iterable of strings.

    Returns:
        The first non-blank string, unchanged (its line ending is kept),
        or None when there is no such string.
    """
    for eachLine in lines:
        if eachLine.strip():
            return eachLine
    return None


def firstLast(webpage):
    """Print the first and the last non-blank line of the file *webpage*.

    Args:
        webpage: path of a local text file (e.g. a downloaded page).

    Nothing is printed for a file that has no non-blank line.
    """
    # A downloaded page can be in any encoding; undecodable bytes are
    # replaced instead of aborting with UnicodeDecodeError.
    with open(webpage, encoding="utf-8", errors="replace") as f:
        lines = f.readlines()

    for line in (firstNonBlank(lines), firstNonBlank(reversed(lines))):
        if line is None:
            continue
        # Lines normally carry their own newline; only a final line
        # without one needs it added.
        print(line, end="" if line.endswith("\n") else "\n")


def download(url='http://www', process=firstLast):
    """Fetch *url* into a temporary file and hand that file to *process*.

    Args:
        url: address of the page to retrieve.
        process: callable invoked with the local file name of the
            downloaded copy; defaults to firstLast.

    A failed retrieval (IOError/OSError) is reported on stderr and
    *process* is not called.
    """
    try:
        retval = urlretrieve(url)[0]
    except IOError as err:
        print(f"download of {url} failed: {err}", file=sys.stderr)
        return
    if retval:
        process(retval)


if __name__ == '__main__':
    download()
利用 urllib 模块,实现抓取网页中图片的功能:
"""Download every jpg/png/gif image referenced by a web page."""

import urllib.request
import socket
import re
import sys
import os

targetDir = r"C:\Users\elqstux\Desktop\pic"

# Absolute http/https URL ending in an image extension. Quotes and angle
# brackets are excluded so a match cannot run past the end of an HTML
# attribute into the following markup.
imageUrlPattern = re.compile(r"""https?://[^\s"'<>]+?\.(?:jpg|png|gif)\b""")


def destFile(path, directory=None):
    """Return the local file name under which the image *path* is saved.

    Args:
        path: image URL; everything after its last '/' becomes the file
            name (the whole string when it contains no '/').
        directory: folder that receives the file; defaults to the
            module-level targetDir. It is created, including missing
            parent folders, when it does not exist yet.

    Returns:
        The joined path ``directory/<file name>``.
    """
    if directory is None:
        directory = targetDir
    os.makedirs(directory, exist_ok=True)
    return os.path.join(directory, path.rsplit('/', 1)[-1])


def findImageLinks(html):
    """Return the distinct image URLs in *html*, in order of first appearance."""
    return list(dict.fromkeys(imageUrlPattern.findall(html)))


def main():
    """Fetch the start page and save every image it references."""
    hostname = "http://www.douban.com"
    # urlretrieve() takes no timeout argument, so set the process-wide
    # socket default to keep a stalled server from hanging the script.
    socket.setdefaulttimeout(30)

    req = urllib.request.Request(hostname)
    with urllib.request.urlopen(req) as webpage:
        charset = webpage.headers.get_content_charset() or "utf-8"
        # Decode the body: str(bytes) would yield the b'...' repr, in
        # which escaped newlines look like part of a URL.
        html = webpage.read().decode(charset, errors="replace")

    for link in findImageLinks(html):
        print(link)
        try:
            urllib.request.urlretrieve(link, destFile(link))
        except OSError as err:
            # One broken image must not abort the remaining downloads.
            print(f"skipped {link}: {err}", file=sys.stderr)


if __name__ == "__main__":
    main()
同样的抓取代码,改为打印 re.findall 返回的各个分组(图片地址与图片类型):
"""Show what re.findall returns for a pattern with two capture groups."""

import urllib.request
import socket
import re
import sys
import os

targetDir = r"H:\pic"

# Two nested pairs of parentheses give two groups: group 1 is the whole
# image URL, group 2 is its extension. Quotes and angle brackets are
# excluded so a match cannot run past the end of an HTML attribute.
imagePattern = re.compile(r"""(https?://[^\s"'<>]+?\.(jpg|png|gif))\b""")


def destFile(path, directory=None):
    """Return the local file name under which the image *path* is saved.

    Args:
        path: image URL; it is split at its last '/' and the trailing
            part becomes the file name (the whole string when it
            contains no '/').
        directory: folder that receives the file; defaults to the
            module-level targetDir. It is created, including missing
            parent folders, when it does not exist yet.

    Returns:
        The joined path ``directory/<file name>``.
    """
    if directory is None:
        directory = targetDir
    os.makedirs(directory, exist_ok=True)
    return os.path.join(directory, path.rsplit('/', 1)[-1])


def findImageMatches(html):
    """Return one ``(url, extension)`` tuple per image URL found in *html*.

    With more than one group in the pattern, re.findall returns a list of
    tuples holding only the text matched by the groups. Repeated URLs are
    reported once per occurrence.
    """
    return imagePattern.findall(html)


def main():
    """Fetch the start page and print each matched URL and its extension."""
    hostname = "http://www.douban.com/"
    req = urllib.request.Request(hostname)
    with urllib.request.urlopen(req) as webpage:
        charset = webpage.headers.get_content_charset() or "utf-8"
        # Decode the body: str(bytes) would yield the b'...' repr, in
        # which escaped newlines look like part of a URL.
        html = webpage.read().decode(charset, errors="replace")

    for picname, picType in findImageMatches(html):
        print(picname)
        print(picType)


if __name__ == "__main__":
    main()

# Sample output:
#   http://img3.douban.com/pics/blank.gif
#   gif
#   http://img3.douban.com/icon/g111328-1.jpg
#   jpg
#   http://img3.douban.com/pics/blank.gif
#   gif
#   http://img3.douban.com/icon/g197523-19.jpg
#   jpg
#   http://img3.douban.com/pics/blank.gif
#   gif
#   ...