Taobao's pages are complex: picking apart the Ajax calls or the JavaScript by hand is a lot of trouble, so this spider drives a real browser with Selenium instead.
Goal: scrape every item listed under Taobao's '美食' (food) search results.
spider.py
```python
# encoding: utf-8
import re
import json

import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from config import *

client = pymongo.MongoClient(MONGODB_URL)
db = client[MONGODB_DB]

# PhantomJS, configured with the arguments from config.py
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# Without an explicit size the default window is too small and the
# page does not render correctly
browser.set_window_size(1400, 900)
wait = WebDriverWait(browser, 10)


def search():
    print('searching...')
    # The first load times out easily, so retry on TimeoutException
    try:
        browser.get('https://www.taobao.com')
        # Wait until both the search box and the button have loaded
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))
        )
        # Python 2 quirk: KEYWORD holds the escapes '\u7f8e\u98df' ('美食'),
        # so decode them into a unicode string before typing
        input.send_keys(KEYWORD.decode('unicode-escape'))
        submit.click()
        total = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))
        )
        get_product()
        return total.text
    except TimeoutException:
        return search()


def next_page(page_number):
    print('turning to page ' + str(page_number))
    try:
        input = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
        )
        input.clear()
        input.send_keys(page_number)
        submit.click()
        # The jump succeeded when the highlighted page number equals
        # the number we just entered
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
            str(page_number)))
        get_product()
    except TimeoutException:
        return next_page(page_number)


# Extract the product information from the current result page
def get_product():
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .m-itemlist .items'))
    )
    # Hand the rendered page source over to BeautifulSoup
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#mainsrp-itemlist .m-itemlist .items .item.J_MouserOnverReq')
    for item in items:
        img = item.select('.J_ItemPic.img')[0].get('src')
        price = item.select('.price.g_price.g_price-highlight > strong')[0].get_text()
        deal = item.select('.deal-cnt')[0].get_text()
        title = item.select('.row.row-2.title > a')[0].get_text().strip()
        shop = item.select('.row.row-3.g-clearfix > .shop > a > span:nth-of-type(2)')[0].get_text()
        location = item.select('.location')[0].get_text()
        product = {
            'img': img,
            'price': price,
            'deal': deal,
            'title': title,
            'shop': shop,
            'location': location
        }
        # json.dumps escapes non-ASCII characters; decode the escapes
        # so the record prints readably (Python 2)
        print(json.dumps(product).decode('unicode-escape'))
        save_to_mongo(product)


def save_to_mongo(product):
    try:
        if db[MONGODB_TABLE].insert(product):
            print('saved to MongoDB: ' + str(product))
    except Exception:
        print('failed to save to MongoDB: ' + str(product))


def main():
    try:
        total = search()
        # Pull the page count out of text like '共 100 页,'
        pattern = re.compile(r'(\d+)')
        total = int(pattern.search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception:
        print('something went wrong')
    finally:
        browser.close()


if __name__ == '__main__':
    main()
```
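Note that PhantomJS has since been deprecated and removed from Selenium, so on a current stack the driver setup at the top of spider.py needs swapping out. A minimal sketch using headless Chrome instead, assuming Chrome and a matching chromedriver are installed (this adaptation is not part of the original post):

```python
# Hypothetical replacement for the PhantomJS setup above:
# headless Chrome, which current Selenium releases support
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')              # no visible browser window
options.add_argument('--window-size=1400,900')  # same viewport as above
browser = webdriver.Chrome(options=options)     # chromedriver must be on PATH
```

The waits and CSS selectors stay exactly as they are; only the driver construction differs.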
config.py
```python
MONGODB_URL = 'localhost'
MONGODB_DB = 'taobao'
MONGODB_TABLE = 'meishi'

# PhantomJS options: skip loading images, enable the disk cache
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# The escapes spell 美食 ('food'); under Python 2, putting the Chinese
# characters in the source directly raises an encoding error
KEYWORD = '\u7f8e\u98df'
```
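Once the spider has run, you can sanity-check what landed in the database. A quick sketch, assuming MongoDB is running locally with a recent pymongo and using the names from config.py above:

```python
import pymongo

client = pymongo.MongoClient('localhost')
collection = client['taobao']['meishi']

print(collection.count_documents({}))    # how many items were stored
for doc in collection.find().limit(3):   # peek at a few records
    print(doc['title'])
```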
That is the whole of this walkthrough on scraping Taobao's asynchronously loaded data with Python and Selenium. I hope it serves as a useful reference, and that you'll keep supporting 服务器之家.
原文链接:https://blog.csdn.net/wqh_jingsong/article/details/66472106