Open Taobao and scrape the listing information of every product across 100 pages of search results, sorted first by comprehensive ranking (综合) and then by sales volume (销量).
1. By comprehensive ranking (综合)
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq  # used to parse the full page source
from config import *  # makes every variable defined in config.py available
import pymongo
import pymysql

# client = pymongo.MongoClient(MONGO_URL)
# db = client[MONGO_DB]

# Sorted by comprehensive ranking, 100 pages.
# Open Taobao, type the keyword (e.g. '美食'), click search;
# then page through automatically: read the total page count first, jump to page _, confirm.

# browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Chrome()
browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)


def search():
    print('Searching...')
    try:
        browser.get('https://www.taobao.com')  # with 'https://s.taobao.com' the keyword cannot be typed in
        input_box = wait.until(
            # Open Taobao, right-click the search box, inspect the element and copy its
            # CSS selector -- it is simply '#q'
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        input_box.send_keys(KEYWORD)  # simulate typing the keyword
        submit.click()                # submit the search
        total = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))  # total page count
        return total.text
    except TimeoutException:
        return search()


# Turn to a given page
def next_page(page_number):
    print('Turning to page', page_number)
    try:
        input_box = wait.until(
            # the page-number input box at the bottom of the results
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        # the confirm button next to the page-number box
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input_box.clear()
        input_box.send_keys(page_number)  # simulate typing the page number
        submit.click()
        # Verify the jump succeeded: the highlighted page number must equal page_number
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
            str(page_number)))
        get_products()
    except TimeoutException:
        next_page(page_number)


# Parse one result page and save every product on it
def get_products():
    wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))  # wait until all items are loaded
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('data-src'),  # find() locates an inner element, attr() reads an attribute
            'shop_id': item.find('.shop').find('a').attr('data-userid'),  # shop id
            'data_id': item.find('.shop').find('a').attr('data-nid'),     # product id
            'link': item.find('.pic-box-inner').find('.pic').find('a').attr('href'),
            'price': item.find('.price').text()[1:-3],   # text() reads the element's text
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text().replace(' ', ''),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        save_to_mysql(product)


def main():
    try:
        total = search()  # at this point total is a string such as '共 100 页,'
        total = int(re.compile(r'(\d+)').search(total).group(1))  # extract the number 100 with a regex
        for i in range(2, total + 1):
            next_page(i)
            print('Done with page %d' % i)  # report the page just scraped
    except Exception:
        print('Something went wrong')
    finally:
        browser.close()  # close the browser whether or not an error occurred


def save_to_mysql(product):
    try:
        conn = pymysql.connect(host='localhost', user='root', passwd=' ',
                               db='test1', port=3306, charset='utf8')
        cur = conn.cursor()  # create a cursor object
        sql = """INSERT INTO women_clothes_zonghe VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
        cur.execute(sql, (product['shop_id'], product['shop'], product['link'],
                          product['data_id'], product['title'], product['price'],
                          product['location'], product['deal'], product['image']))
        print('- - - - - saved - - - - -')
        cur.close()
        conn.commit()
        conn.close()  # close the connection
    except pymysql.Error as e:
        print(e)


if __name__ == '__main__':
    # Connect to the database and (re)create the target table
    conn = pymysql.connect(host='localhost', user='root', passwd=' ',
                           db='test1', port=3306, charset='utf8')
    cur = conn.cursor()  # create a cursor object
    cur.execute("DROP TABLE IF EXISTS women_clothes_zonghe")  # drop the table if it already exists
    sqlc = """CREATE TABLE women_clothes_zonghe(
        shop_id VARCHAR(500),
        shop VARCHAR(500),
        link VARCHAR(1000),
        data_id VARCHAR(100),
        title VARCHAR(1000),
        price VARCHAR(500),
        location VARCHAR(500),
        deal VARCHAR(500),
        image VARCHAR(1000)
    )"""
    cur.execute(sqlc)  # create the table
    main()
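Both scripts do `from config import *`, but the original post never shows config.py. Below is a minimal sketch of what it presumably contains, based purely on the names the scripts reference: KEYWORD (the search term, '美食' per the comment above), SERVICE_ARGS (for the commented-out PhantomJS driver), and MONGO_URL / MONGO_DB (for the commented-out MongoDB client). All values are illustrative assumptions, not the author's actual settings.

# config.py -- a hypothetical sketch; the original post does not include this file.
KEYWORD = '美食'            # the keyword typed into Taobao's search box
MONGO_URL = 'localhost'     # only needed if the MongoDB code is uncommented
MONGO_DB = 'taobao'
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']  # PhantomJS startup options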
2. By sales volume (销量)
import re
import json
import urllib.parse
import requests
import bs4
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq  # used to parse the full page source
from config import *  # makes every variable defined in config.py available
import pymongo
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Scrape the product information from all 100 result pages, sorted by sales volume

browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)


def get_url(keyword):
    url_str = urllib.parse.quote(keyword)
    i = 0
    for j in range(100):
        yield {
            'url': ('https://s.taobao.com/search?q={}&imgfile=&commend=all&ssid=s5-e&search_type=item'
                    '&sourceId=tb.index&spm=a21bo.50862.201856-taobao-item.1&ie=utf8'
                    '&initiative_id=tbindexz_20170808&sort=sale-desc&bcoffset=0&p4ppushleft=%2C44&s={}'
                    ).format(url_str, i)
        }
        i += 44  # each result page holds 44 items, so the offset advances by 44


# verified to work
def get_products(url):
    browser.get(url)
    wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))  # wait until all items are loaded
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            # Reading 'src' misses some images, because Taobao sets both 'data-src' and 'src'
            # and their order differs between items; reading 'data-src' directly avoids None
            'image': item.find('.pic .img').attr('data-src'),  # find() locates an inner element, attr() reads an attribute
            # 'image': item.find('.pic-box-inner').find('.pic').find('img').attr('src'),
            'price': item.find('.price').text()[1:-3],   # text() reads the element's text
            'shop_id': item.find('.shop').find('a').attr('data-userid'),  # shop id
            'data_id': item.find('.shop').find('a').attr('data-nid'),     # product id
            'link': item.find('.pic-box-inner').find('.pic').find('a').attr('href'),
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text().replace(' ', '')
        }
        save_to_mysql(product)


def save_to_mysql(product):
    try:
        conn = pymysql.connect(host='localhost', user='root', passwd=' ',
                               port=3306, db='test1', charset='utf8')
        cur = conn.cursor()
        sql = "insert into women_clothes_sales2 values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        cur.execute(sql, (product['shop_id'], product['shop'], product['link'],
                          product['data_id'], product['title'], product['price'],
                          product['location'], product['deal'], product['image']))
        print('- - - saved - - -')
        cur.close()
        conn.commit()
        conn.close()
    except pymysql.Error as e:
        print(e)


def main():
    keyword = '女装'
    links = get_url(keyword)  # a generator yielding one url dict per result page
    for link in links:
        url = link['url']
        get_products(url)  # parse the page and save its products


if __name__ == '__main__':
    conn = pymysql.connect(host='localhost', user='root', passwd=' ',
                           db='test1', port=3306, charset='utf8')
    cur = conn.cursor()
    cur.execute('drop table if exists women_clothes_sales2')
    sqlc = ("create table women_clothes_sales2("
            "shop_id varchar(100), shop varchar(500), link varchar(1000), "
            "data_id varchar(100), title varchar(500), price varchar(200), "
            "location varchar(100), deal varchar(100), image varchar(1000))")
    cur.execute(sqlc)
    cur.close()
    conn.commit()
    conn.close()
    main()
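Unlike the first script, this one never clicks the pager: it builds each result-page URL directly. Taobao's search results use the `s` query parameter as an item offset, and each page shows 44 items, so page n starts at s = 44 * (n - 1). A quick way to sanity-check the generator before launching the browser (a sketch; it only prints the trailing offsets, no network access needed):

# Print the offsets of the first three generated URLs.
for n, link in zip(range(1, 4), get_url('女装')):
    print(n, link['url'].rsplit('&s=', 1)[-1])
# expected output:
# 1 0
# 2 44
# 3 88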
That is all for this article. I hope it helps with your study, and I hope you will continue to support 服务器之家.
原文链接:http://blog.csdn.net/sisteryaya/article/details/77894521