This post shares example Python code for scraping book cover images, for your reference. The details follow below.
kongfuzi.py
Uses rotating proxy IPs, delayed retries, and spoofed request headers to get around the site's anti-scraping measures.
import requests
import random
import time


class DownLoad():
    def __init__(self):
        # Pool of candidate HTTP proxies (ip:port)
        self.ip_list = [
            '191.33.179.242:8080', '122.72.108.53:80', '93.190.142.214:80',
            '189.8.88.125:65301', '36.66.55.181:8080', '170.84.102.5:8080',
            '177.200.72.214:20183', '115.229.115.190:9000'
        ]
        # Pool of User-Agent header values
        self.user_agent_list = [
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
        ]

    def get(self, url, proxy=None, timeout=20, num=5):
        print("Requesting %s" % url)
        # Randomize the User-Agent on every attempt
        UA = random.choice(self.user_agent_list)
        headers = {'User-Agent': UA}
        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException:
                if num > 0:
                    # Wait, then retry without a proxy
                    time.sleep(10)
                    return self.get(url, num=num - 1)
                else:
                    # Direct retries exhausted: switch to a random proxy
                    time.sleep(10)
                    IP = random.choice(self.ip_list).strip()
                    proxy = {'http': IP}
                    return self.get(url, proxy=proxy, timeout=timeout)
        else:
            try:
                IP = random.choice(self.ip_list).strip()
                proxy = {'http': IP}
                # Route the request through the chosen proxy
                return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
            except requests.RequestException:
                if num > 0:
                    time.sleep(10)
                    IP = random.choice(self.ip_list).strip()
                    proxy = {'http': IP}
                    print("Switching proxy")
                    print("Current proxy: %s" % proxy)
                    return self.get(url, proxy=proxy, num=num - 1)
                # All proxied retries exhausted: give up and return None
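The downloader can also be tried on its own, outside the GUI. A minimal sketch (the search keyword here is only an example; note that get() returns None once all proxied retries are exhausted):

import kongfuzi

downloader = kongfuzi.DownLoad()
# Request a search page; on failure the class retries, eventually through a random proxy
response = downloader.get("http://search.kongfz.com/product_result/?select=0&key=python")
if response is not None:
    print(response.status_code)
    print(len(response.content), "bytes received")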
main.py
Saves the scraped images locally, then displays them in the GUI.
import kongfuzi
import os
import requests
import bs4
from tkinter import *
from PIL import Image, ImageTk


# Build the search URL from the keyword entry, then download and display the results
def download():
    baseUrl = "http://search.kongfz.com"
    keyword = e1.get()
    url = baseUrl + "/product_result/?select=0&key=" + keyword
    print("Search URL: " + url)
    show(url)


# Decode the response body and parse it with BeautifulSoup
def changesoup(html):
    htm = html.content
    html_doc = str(htm, 'utf-8')
    soup = bs4.BeautifulSoup(html_doc, "html.parser")
    return soup


# Collect the book information lists: titles, prices, locations, store names
def bookinfo(soup):
    # Prices
    price = []
    soupprice = soup.select(".first-info .f_right .bold")
    for i in soupprice:
        price.append(i.string)
    # Store names; skip tags whose .string is None rather than
    # removing items from the list while iterating over it
    soupstorename = soup.select(".text a span")
    storename = [i.string for i in soupstorename if i.string is not None]
    # Seller locations
    place = []
    soupplace = soup.select(".user-place")
    for i in soupplace:
        place.append(i.string)
    # Titles
    bookname = []
    bookname1 = soup.select(".search-wrap .search-main .search-main-result .result-content .result-list .item .item-info .title .link")
    for each in bookname1:
        bookname.append(each.get_text())
    return bookname, price, place, storename


# Save the cover images into the local image/ directory
def imgsave(soup):
    dirName = "image"
    os.makedirs(dirName, exist_ok=True)
    filePathList = []
    imgUrl = soup.select(".search-main-result .result-content .result-list .item .item-img .img-box img")
    if not imgUrl:
        print("No images found under the current node")
    else:
        i = 0
        for imageUrls in imgUrl:
            # Image address, e.g. http://book.kongfz.com/img/pc/error.jpg
            downloadUrl = imageUrls.get('src')
            print("Image URL to download:", downloadUrl)
            # Keep only the last path segment, prefixed with an index, as the file name
            split = downloadUrl.split("/")
            fileName = str(i) + "-" + os.path.basename(split[len(split) - 1])
            print("File name: " + fileName)
            filePath = os.path.join(dirName, fileName)
            filePathList.append(filePath)
            if not os.path.exists(filePath):
                imageUrlPath = requests.get(downloadUrl)
                # Abort on a failed request
                imageUrlPath.raise_for_status()
                # 'wb': image data must be written in binary mode
                imageFile = open(filePath, 'wb')
                for image in imageUrlPath.iter_content(10000):
                    imageFile.write(image)
                imageFile.close()
            i = i + 1
    return filePathList


# Fetch the search page, parse it, and display the covers in a scrollable window
def show(url):
    xz = kongfuzi.DownLoad()
    # Add the user-supplied proxy IP to ip_list before requesting
    add_ip = e2.get()
    if add_ip:
        xz.ip_list.append(add_ip)
    html = xz.get(url)
    soup = changesoup(html)
    bookname, price, place, storename = bookinfo(soup)
    filePathList = imgsave(soup)
    root1 = Toplevel()
    root1.geometry("1720x800")
    root1.title("Kongfz cover scraper")
    # Convert the saved files into Tkinter-displayable images
    photo = []
    for each in filePathList:
        temp = Image.open(each)
        photo.append(ImageTk.PhotoImage(temp))
    canvas = Canvas(root1, width=1700, height=800, scrollregion=(0, 0, 0, 4000))
    canvas.place(x=10, y=10)
    frame = Frame(canvas)  # the frame lives inside the canvas
    frame.place(width=1680, height=800)
    # Lay out up to 50 results, five per row; never index past what was scraped
    count = min(50, len(photo), len(bookname), len(price), len(place), len(storename))
    for i in range(count):
        rownum = i // 5
        columnnum = i % 5
        imgLabel1 = Label(frame, image=photo[i], width=280, height=280)
        imgLabel1.grid(row=rownum * 5, column=columnnum, padx=10, pady=5)
        infoLabel1 = Label(frame, text="Title: " + bookname[i], bg="#FFF8DC", justify=LEFT)
        infoLabel1.grid(row=rownum * 5 + 1, column=columnnum, padx=45, pady=2, sticky=W)
        infoLabel2 = Label(frame, text="Price: " + price[i] + " yuan", bg="#FFF8DC", justify=LEFT)
        infoLabel2.grid(row=rownum * 5 + 2, column=columnnum, padx=45, pady=2, sticky=W)
        infoLabel3 = Label(frame, text="Ships from: " + place[i], bg="#FFF8DC", justify=LEFT)
        infoLabel3.grid(row=rownum * 5 + 3, column=columnnum, padx=45, pady=2, sticky=W)
        infoLabel4 = Label(frame, text="Store: " + storename[i], bg="#FFF8DC", justify=LEFT)
        infoLabel4.grid(row=rownum * 5 + 4, column=columnnum, padx=45, pady=2, sticky=W)
    # Vertical scrollbar wired to the canvas
    vbar = Scrollbar(canvas, orient=VERTICAL)
    vbar.place(x=1680, width=20, height=800)
    vbar.configure(command=canvas.yview)
    canvas.config(yscrollcommand=vbar.set)
    canvas.create_window((800, 2000), window=frame)
    mainloop()


if __name__ == '__main__':
    # Main window: keyword entry, proxy entry, search button
    root = Tk()
    root.title("Kongfz cover scraper")
    e1 = Entry(root)
    e2 = Entry(root)
    e1.grid(row=0, column=0, padx=20, pady=20)
    e2.grid(row=0, column=2, padx=20, pady=20)
    label1 = Label(root, text="Keyword", width=10).grid(row=0, column=1, padx=10, pady=5)
    label2 = Label(root, text="Add proxy IP", width=10).grid(row=0, column=3, padx=10, pady=5)
    btn1 = Button(root, text="Search", width=10, command=download).grid(row=1, column=1, padx=10, pady=5)
    mainloop()
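The parsing and saving steps can also be exercised without opening the Tkinter window, since the GUI in main.py only starts under the __main__ guard. A minimal smoke test, assuming both files sit in the same directory (the keyword is only an example):

import kongfuzi
import main  # importing main.py is safe: the GUI only runs under __main__

xz = kongfuzi.DownLoad()
html = xz.get("http://search.kongfz.com/product_result/?select=0&key=python")
if html is not None:
    soup = main.changesoup(html)
    bookname, price, place, storename = main.bookinfo(soup)
    print(len(bookname), "titles parsed")
    filePathList = main.imgsave(soup)
    print(len(filePathList), "covers saved to ./image")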
That's all for this article. I hope it helps with your learning, and please keep supporting 服务器之家.
Original article: https://blog.csdn.net/Bancroft_boy/article/details/80904322