python3通过selenium爬虫获取到jd（京东）商品的实例代码_Python

先给大家介绍下python3 selenium使用

其实selenium就相当于模拟人的点击、输入等操作来连续地操控浏览器。如果你玩过王者荣耀的话，可能知道在2016年一月份的版本里面有一个bug：

安卓手机下载一个按键精灵，就可以在冒险模式里面设置按键，让手机自动玩闯关，一局19个金币，一晚上就能攒够一个英雄了。不过

程序员也不是吃素的，后来给每个星期设置了大概4000金币的上限。有兴趣的可以去试试。（注：手机需要root）

进入正题：

				?

									from selenium import webdriver

									from selenium.webdriver.common.by import By

									from selenium.webdriver.common.keys import Keys

									from selenium.webdriver.support import expected_conditions as ec

									from selenium.webdriver.support.wait import WebDriverWait

在写之前需要先安装selenium模块（pip install selenium）

				?

									# Minimal end-to-end demo: open Baidu in Chrome, search for "python",
									# wait for the result list, then print the URL, cookies and page source.
									# The imports are repeated here so the listing runs on its own.
									from selenium import webdriver
									from selenium.webdriver.common.by import By
									from selenium.webdriver.common.keys import Keys
									from selenium.webdriver.support import expected_conditions as ec
									from selenium.webdriver.support.wait import WebDriverWait

									brguge = webdriver.Chrome()  # create the driver object (launches Chrome)
									try:
									    brguge.get('https://www.baidu.com')  # send the GET request
									    # Named search_box rather than `input` to avoid shadowing the builtin.
									    search_box = brguge.find_element(By.ID, 'kw')  # locate the search field
									    search_box.send_keys('python')  # type the keyword
									    search_box.send_keys(Keys.ENTER)  # press Enter to submit
									    wait = WebDriverWait(brguge, 10)  # poll for up to 10 seconds
									    # The expected condition takes ONE locator tuple, not two arguments.
									    wait.until(ec.presence_of_element_located((By.ID, 'content_left')))
									    print(brguge.current_url)  # URL of the result page
									    # get_cookie() needs a cookie name; get_cookies() returns all of them.
									    print(brguge.get_cookies())
									    print(brguge.page_source)  # HTML source of the result page
									finally:
									    # quit() ends the whole driver session; close() only closes the window.
									    brguge.quit()

下面是一些selenium模块的基本用法

查找元素

单个元素

				?

									(from selenium import webdriver)

									    brguge.find_element_by_id('q')  通过id查找：找到id为q的元素

									    brguge.find_element_by_css_selector('#q')  通过CSS选择器查找：#q表示id为q的元素

									    brguge.find_element_by_xpath('//*[@id="q"]')  通过XPath查找；以上三种写法效果一样

									    brguge.find_element_by_name()  通过name属性来查找

									    brguge.find_element_by_link_text()  通过链接的完整文本来查找

									    brguge.find_element_by_partial_link_text()  通过链接的部分文本来查找

									    brguge.find_element_by_tag_name()  通过标签名来查找

									    brguge.find_element_by_class_name()  通过class名查找

									    from selenium import webdriver

									    from selenium.webdriver.common.by import By

									    brguge.find_element(By.ID,'q')  通用查找方式（新版Selenium已移除find_element_by_*系列方法，推荐使用这种写法）

    多个元素（find_elements）：方法名比查找单个元素时多了一个s
        返回值是一个列表，包含所有匹配到的元素
        brguge.find_elements_by_css_selector('.service-bd li')  查找class为service-bd的元素下面的所有li元素
        brguge.find_elements(By.CSS_SELECTOR,'.service-bd li')  通用写法，两个作用一样
        (利用索引或切片就可以从列表中取出单个或多个元素了)
    元素交互操作（先获取元素，然后再对它调用方法）
        选择输入框 --》send_keys('输入文字')--》clear()清空输入框--》再输入别的内容--》找到搜索按钮--》click()点击
        input.clear()  清空输入框
    交互动作（将动作附加到动作链中串行执行）
        switch_to_frame('iframeresult')  先切换到目标元素所在的iframe
        用CSS选择器分别找到要交互的两个元素（拖拽的源元素source和目标元素target）
        调用ActionChains(brguge)创建动作链（参数是谷歌浏览器的驱动对象）
        drag_and_drop(source,target)  把第一个元素拖放到第二个元素上面
        perform()  执行动作链中的全部动作

下面看下python3通过selenium爬虫获取到jd（京东）商品的实例代码。

具体代码如下所示：

				?

									import json
									import time

									from lxml import etree
									from selenium import webdriver
									from selenium.common.exceptions import NoSuchElementException
									from selenium.common.exceptions import TimeoutException
									from selenium.webdriver.chrome.options import Options
									from selenium.webdriver.common.by import By
									from selenium.webdriver.common.keys import Keys
									from selenium.webdriver.support import expected_conditions as ec
									from selenium.webdriver.support.wait import WebDriverWait

									# First page the crawler's browser loads. Despite the "login" in the name,
									# the value is the JD site root.
									jd_url_login = "https://www.jd.com/"

									class customizeexception(Exception):
									    """Crawler-specific error carrying a numeric status code and a message.

									    Attributes:
									        status: Numeric code identifying the condition.
									        msg: Human-readable description of the condition.
									    """

									    def __init__(self, status, msg):
									        # Forward both values to Exception so str()/repr()/args of the
									        # raised error show them in tracebacks and logs.
									        super().__init__(status, msg)
									        self.status = status
									        self.msg = msg

									class jd:
									    """Crawl jd.com search results with headless Chrome.

									    For every search keyword the parsed listings are appended to
									    ``jd-<keyword>.json``, one JSON object per line.
									    """

									    def __init__(self):
									        # Declare everything close() touches up front so cleanup stays safe
									        # even if browser start-up fails half-way through.
									        self.browser = None
									        self.wait = None
									        self.file = None
									        self.__init_browser()

									    def __init_browser(self):
									        """Start headless Chrome, load ``jd_url_login`` and build the shared wait."""
									        # Must not be named ``options``/``Options``: rebinding the imported
									        # class name locally would make the constructor call fail.
									        chrome_options = Options()
									        chrome_options.add_argument("--headless")
									        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
									        # Image-free mode: do not load pictures.
									        chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
									        self.browser = webdriver.Chrome(options=chrome_options)
									        # Maximize the browser window.
									        self.browser.maximize_window()
									        # Implicit wait of 3 seconds for element lookups.
									        self.browser.implicitly_wait(3)
									        self.browser.get(jd_url_login)
									        # Explicit wait (10 s timeout) reused by the other methods.
									        self.wait = WebDriverWait(self.browser, 10)

									    def __search_goods(self, goods):
									        """Open the output file for *goods* and submit it in the search box.

									        Args:
									            goods: Search keyword; also used in the output file name.
									        """
									        # Close the previous keyword's file first, otherwise every new
									        # search leaks an open handle.
									        if self.file is not None:
									            self.file.close()
									        self.file = open("jd-{}.json".format(goods), "a", encoding="utf-8")
									        self.wait.until(ec.presence_of_all_elements_located((By.ID, "key")))
									        search_input = self.browser.find_element(By.ID, "key")
									        search_input.clear()
									        search_input.send_keys(goods, Keys.ENTER)

									    def __get_goods_info(self, page_source):
									        """Yield one dict per product parsed from *page_source*.

									        Args:
									            page_source: HTML text of a search-result page.

									        Yields:
									            dict with the keys ``goods_name``, ``goods_price``,
									            ``comment_num`` and ``shop_name``.
									        """
									        tree = etree.HTML(page_source)
									        # Product name: read from the link's title attribute (the original
									        # author noted the link text would be the better source).
									        names = tree.xpath("//div[@class='gl-i-wrap']//div[contains(@class,'p-name')]/a/@title")
									        # Product price.
									        prices = tree.xpath("//div[@class='gl-i-wrap']//div[@class='p-price']/strong/i/text()")
									        # Number of comments: flatten each <strong> node to its text.
									        comment_nodes = tree.xpath("//div[@class='p-commit']/strong")
									        comment_counts = [node.xpath("string(.)") for node in comment_nodes]
									        # Shop name.
									        shops = tree.xpath("//a[@class='curr-shop']/text()")
									        # NOTE(review): zip() pairs the four lists purely by position and
									        # stops at the shortest one, so a listing that lacks one of the
									        # fields shifts or drops rows. Confirm against the live page.
									        for name, price, comments, shop in zip(names, prices, comment_counts, shops):
									            yield {
									                "goods_name": name,
									                "goods_price": price,
									                "comment_num": comments,
									                "shop_name": shop,
									            }

									    def __swipe_page(self):
									        """Scroll to the bottom until the page stops growing; return its source."""
									        height = self.browser.execute_script("return document.body.scrollHeight;")
									        self.browser.execute_script("window.scrollTo(0, {});".format(height))
									        while True:
									            # Give the page a moment to append further content.
									            time.sleep(1)
									            now_height = self.browser.execute_script("return document.body.scrollHeight;")
									            if height == now_height:
									                return self.browser.page_source
									            # scrollTo(x, y): x stays 0, only the vertical offset changes.
									            self.browser.execute_script("window.scrollTo(0, {});".format(now_height))
									            height = now_height

									    def __is_element_exists(self, xpath):
									        """Return True if *xpath* matches an element on the current page."""
									        try:
									            self.browser.find_element(By.XPATH, xpath)
									            return True
									        except NoSuchElementException:
									            return False

									    def __click_next_page(self):
									        """Click the "next page" link.

									        Raises:
									            customizeexception: status 10000 when no usable next-page link
									                is found, i.e. the current keyword is finished.
									        """
									        try:
									            self.wait.until(ec.presence_of_all_elements_located((By.CLASS_NAME, "pn-next")))
									        except TimeoutException:
									            # No pager showed up at all: treat it as "keyword finished"
									            # instead of aborting the whole crawl.
									            raise customizeexception(10000, "该商品访问完毕") from None
									        # Exact class match. Presumably the button on the last page carries
									        # an extra class and therefore stops matching; verify on the site.
									        xpath = "//a[@class='pn-next']"
									        if not self.__is_element_exists(xpath):
									            raise customizeexception(10000, "该商品访问完毕")
									        self.browser.find_element(By.XPATH, xpath).click()

									    def __write_to_json(self, dic: dict):
									        """Append *dic* to the current output file as one JSON line."""
									        data_json = json.dumps(dic, ensure_ascii=False)
									        self.file.write(data_json + "\n")

									    def __crawl_goods(self, goods):
									        """Search for *goods* and dump every result page until the last one."""
									        self.__search_goods(goods)
									        page_no = 1
									        while True:
									            print("正在爬取商品 <{}>---第{}页......".format(goods, page_no))
									            time.sleep(3)
									            html = self.__swipe_page()
									            for dic in self.__get_goods_info(html):
									                self.__write_to_json(dic)
									            try:
									                self.__click_next_page()
									            except customizeexception:
									                return
									            page_no += 1

									    def run(self, goods):
									        """Crawl *goods*, then every keyword left in the module-level ``goods_list``.

									        Keywords are processed in a loop rather than by recursion: with the
									        recursive form each unwinding call scraped the final page once more
									        and wrote duplicate rows.

									        Args:
									            goods: First search keyword to crawl.
									        """
									        while True:
									            self.__crawl_goods(goods)
									            try:
									                goods = goods_list.pop(0)
									            except (NameError, IndexError):
									                # No shared list defined, or nothing left in it.
									                return

									    def close(self):
									        """Close the output file and end the browser session; safe to repeat."""
									        # getattr() keeps this usable on partially initialised instances.
									        out_file = getattr(self, "file", None)
									        if out_file is not None:
									            self.file = None
									            out_file.close()
									        browser = getattr(self, "browser", None)
									        if browser is not None:
									            self.browser = None
									            # quit() ends the driver session; close() only closes the window.
									            browser.quit()

									    def __del__(self):
									        self.close()

									if __name__ == '__main__':
									    # Keywords to crawl. This must stay a module-level name: jd.run() pops
									    # the remaining keywords from ``goods_list`` itself.
									    goods_list = [
									        "纯牛奶", "酸奶", "奶茶", "床上用品", "电磁炉", "电视",
									        "小米笔记本", "华硕笔记本", "联想笔记本", "男士洗面奶", "女士洗面奶",
									        "沐浴露", "洗发露", "牙刷", "牙膏", "拖鞋", "剃须刀", "水手服",
									        "运动服", "红龙果", "苹果", "香蕉", "洗衣液", "电饭煲",
									    ]
									    # Validate the list before launching a browser.
									    try:
									        goods = goods_list.pop(0)
									    except IndexError:
									        raise customizeexception(20000, "goods_list不能为空")
									    # Bound to a new name so the class ``jd`` is not overwritten by its
									    # own instance.
									    spider = jd()
									    try:
									        spider.run(goods)
									    finally:
									        # Dropping the last reference triggers jd.__del__, which releases
									        # the browser and the output file.
									        del spider

总结

以上所述是小编给大家介绍的python3通过selenium爬虫获取到jd（京东）商品的实例代码，希望对大家有所帮助。如果大家有任何疑问请给我留言，小编会及时回复大家的。在此也非常感谢大家对服务器之家网站的支持！

如果你觉得本文对你有帮助，欢迎转载，烦请注明出处，谢谢！

原文链接：https://www.cnblogs.com/zhuchunyu/archive/2019/04/25/10765875.html

python3通过selenium爬虫获取到jd（京东）商品的实例代码

相关文章

热门资讯