python书籍信息爬虫实例_Python

python书籍信息爬虫示例，供大家参考，具体内容如下

背景说明

需要收集一些书籍信息，以豆瓣书籍条目作为源，得到一些有效书籍信息，并保存到本地数据库。

获取书籍分类标签

具体可参考这个链接：
https://book.douban.com/tag/?view=type

然后将这些分类标签链接存到本地某个文件，存储内容如下

									https://book.douban.com/tag/小说

									https://book.douban.com/tag/外国文学

									https://book.douban.com/tag/文学

									https://book.douban.com/tag/随笔

									https://book.douban.com/tag/中国文学

									https://book.douban.com/tag/经典

									https://book.douban.com/tag/日本文学

									https://book.douban.com/tag/散文

									https://book.douban.com/tag/村上春树

									https://book.douban.com/tag/诗歌

									https://book.douban.com/tag/童话

									......

获取书籍信息，并保存本地数据库

假设已经建好mysql表，如下：

									-- Destination table for scraped book records; one row per book, deduplicated
									-- by the unique `bookid` key.
									CREATE TABLE `book_info` (
									 `id` int(11) NOT NULL AUTO_INCREMENT,
									 `bookid` varchar(64) NOT NULL COMMENT 'book ID',
									 `tag` varchar(32) DEFAULT '' COMMENT '分类目录',
									 `bookname` varchar(256) NOT NULL COMMENT '书名',
									 `subname` varchar(256) NOT NULL COMMENT '二级书名',
									 `author` varchar(256) DEFAULT '' COMMENT '作者',
									 `translator` varchar(256) DEFAULT '' COMMENT '译者',
									 `press` varchar(128) DEFAULT '' COMMENT '出版社',
									 -- '0000-00-00' is rejected as a default when the server runs in strict
									 -- mode with NO_ZERO_DATE; use the lowest supported DATE value instead,
									 -- which is also the "unknown date" sentinel written by the crawler.
									 `publishAt` date DEFAULT '1000-01-01' COMMENT '出版日期',
									 `stars` float DEFAULT '0' COMMENT '评分',
									 `price_str` varchar(32) DEFAULT '' COMMENT '价格string',
									 `hotcnt` int(11) DEFAULT '0' COMMENT '评论人数',
									 `bookdesc` varchar(8192) DEFAULT NULL COMMENT '简介',
									 `updateAt` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改日期',
									 PRIMARY KEY (`id`),
									 UNIQUE KEY `idx_bookid` (`bookid`),
									 KEY `idx_bookname` (`bookname`),
									 KEY `hotcnt` (`hotcnt`),
									 KEY `stars` (`stars`),
									 KEY `idx_tag` (`tag`)
									) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='书籍信息';

并已实现相关爬虫逻辑，主要用到了BeautifulSoup包，如下：

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

									#!/usr/bin/python

									# coding: utf-8

									import re

									import logging

									import requests

									import pymysql

									import random

									import time

									import datetime

									from hashlib import md5

									from bs4 import BeautifulSoup

									# Configure the root logger once at import time: INFO level and above, each
									# record rendered as "[LEVEL][logger name][YYYY-MM-DD HH:MM:SS]message".
									logging.basicConfig(level=logging.INFO,

									     format='[%(levelname)s][%(name)s][%(asctime)s]%(message)s',

									     datefmt='%Y-%m-%d %H:%M:%S')

									class DestDB:
									    """Connection settings for the MySQL database that receives scraped books."""

									    # Login credentials.
									    User = "test"
									    Pwd = "123456"

									    # Server address and the schema selected after connecting.
									    Host = "192.168.1.10"
									    DB = "spider"

									    # Name of the table holding the book rows.
									    Table = "book_info"

									def connect_db(host, db, user, pwd):
									    """Open a PyMySQL connection with autocommit enabled.

									    Args:
									        host: MySQL server host name or IP address.
									        db: Name of the database (schema) to select.
									        user: Login user name.
									        pwd: Login password.

									    Returns:
									        An open ``pymysql`` connection; the caller is responsible for
									        closing it.
									    """
									    # ``password`` / ``database`` are the documented keyword names; the
									    # ``passwd`` / ``db`` spellings are deprecated compatibility aliases.
									    # NOTE(review): connect_timeout is in seconds, so 3600 lets a connection
									    # attempt hang for up to an hour -- confirm this is intended.
									    return pymysql.connect(
									        host=host,
									        user=user,
									        password=pwd,
									        database=db,
									        charset='utf8',
									        connect_timeout=3600,
									        autocommit=True)

									def disconnect_db(conn, cursor):
									    """Close a cursor and then the connection it belongs to.

									    Args:
									        conn: Open connection object exposing ``close()``.
									        cursor: Cursor object exposing ``close()``.

									    Raises:
									        Whatever ``cursor.close()`` or ``conn.close()`` raises. The connection
									        is closed even when closing the cursor fails.
									    """
									    try:
									        cursor.close()
									    finally:
									        # Always release the connection, even if the cursor close blew up.
									        conn.close()

									# Extract the number of raters; when it cannot be read (for example when
									# fewer than 10 people rated the book), treat it as 10.
									def hotratings(person):
									    """Return the rater count parsed from a rating-count element.

									    Args:
									        person: Object exposing ``get_text()`` that returns the element text.
									            Presumably of the form "(1234人评价)" -- the parser only relies
									            on one leading and four trailing characters surrounding the
									            number in the first whitespace-separated token.

									    Returns:
									        The parsed count as an int, or 10 when the text is empty or the
									        stripped token is not an integer.
									    """
									    tokens = person.get_text().split()
									    if not tokens:
									        # Empty / whitespace-only text used to raise an uncaught IndexError.
									        return 10
									    ptext = tokens[0]
									    try:
									        # Drop the first character and the last four, keep the digits between.
									        return int(ptext[1:len(ptext) - 4])
									    except ValueError:
									        return 10

									# Persist parsed rows to the database.
									def save_to_db(tag, book_reslist):
									    """Insert a batch of parsed books with one ``insert ignore`` statement.

									    Args:
									        tag: Category tag stored in the ``tag`` column of every row.
									        book_reslist: List of rows; each row is a list whose first ten items
									            are, in order: author, translator, bookname, subname, press,
									            publishAt, price_str, stars, hotcnt, bookdesc.

									    Returns:
									        None. Nothing is done (and no connection is opened) for an empty list.
									    """
									    if not book_reslist:
									        # An empty batch would otherwise produce "... values " with no tuples.
									        return

									    columns = ("bookid", "tag", "author", "translator", "bookname", "subname",
									               "press", "publishAt", "price_str", "stars", "hotcnt", "bookdesc")
									    row_placeholder = "(%s)" % ",".join(["%s"] * len(columns))
									    isql = "insert ignore into `%s` (%s) values %s" % (
									        DestDB.Table,
									        ",".join("`%s`" % col for col in columns),
									        ",".join([row_placeholder] * len(book_reslist)))

									    values = []
									    for row in book_reslist:
									        # For now md5("<author>_<bookname>") serves as the unique book id.
									        bookid = md5(("%s_%s" % (row[0], row[2])).encode('utf-8')).hexdigest()
									        values.extend([bookid, tag] + row[:10])

									    dest_conn = connect_db(DestDB.Host, DestDB.DB, DestDB.User, DestDB.Pwd)
									    dest_cursor = dest_conn.cursor()
									    try:
									        dest_cursor.execute(isql, tuple(values))
									    finally:
									        # Release the cursor and connection even when the insert fails.
									        disconnect_db(dest_conn, dest_cursor)

									def _parse_publish_date(text):
									    """Convert text starting with 'YYYY-M' into a date on the 1st of that month.

									    Returns ``datetime.date(1000, 1, 1)`` when the text does not start with
									    a year-month pair or the pair is not a valid calendar month.
									    """
									    matched = re.match(r'^(\d{4})-(\d{1,2})', text)
									    if matched:
									        year, month = int(matched.group(1)), int(matched.group(2))
									        if year >= 1 and 1 <= month <= 12:
									            return datetime.date(year, month, 1)
									    return datetime.date(1000, 1, 1)


									def _parse_book_item(item):
									    """Build one book row from a single list entry, or None to skip it.

									    The row order matches what ``save_to_db`` expects: author, translator,
									    bookname, subtitle, press, publish date, price, stars, rater count,
									    description.
									    """
									    title_link = item.select_one("div.info > h2 > a")
									    detail = item.select_one("div.info > div.pub")
									    if title_link is None or detail is None:
									        return None

									    title_strs = [s.replace('\n', '').strip() for s in title_link.strings]
									    title_strs = [s for s in title_strs if s]
									    if not title_strs:
									        return None
									    # Some books carry a second-level title as a second text fragment.
									    bookname = title_strs[0]
									    subtitle = title_strs[1] if len(title_strs) >= 2 else ""

									    detail_texts = [s.strip()
									                    for s in detail.get_text().replace('\n', '').split("/")]
									    if len(detail_texts) < 4:
									        return None
									    # The last three fields are press / date / price and the first is the
									    # author; anything in between is kept as translator text (empty when
									    # the book has no translator).
									    author = detail_texts[0]
									    translator = " / ".join(detail_texts[1:-3])
									    press, publish_text, price = detail_texts[-3:]

									    # Entries without a score element (or with an empty one) get -1.
									    score = item.select_one("div.info > div.star.clearfix > span.rating_nums")
									    score_text = score.get_text().strip() if score is not None else ""
									    stars = round(float(score_text), 1) if score_text else -1.0

									    person = item.select_one("div.info > div.star.clearfix > span.pl")
									    hotcnt = hotratings(person) if person is not None else 10

									    desc = item.select_one("div.info > p")
									    desc_text = desc.get_text() if desc is not None else ""

									    return [author, translator, bookname, subtitle, press,
									            _parse_publish_date(publish_text), price, stars, hotcnt, desc_text]


									# Handle one fetched listing page.
									def do_parse(tag, url):
									    """Fetch one listing page, parse its book entries and store them.

									    Each ``<li>`` under ``#subject_list > ul`` is parsed on its own, so an
									    entry lacking a score or description cannot shift fields between books.

									    Args:
									        tag: Category tag saved with every row; when empty, the last path
									            segment of ``url`` (before any query string) is used instead.
									        url: Page URL to download.

									    Returns:
									        Number of list entries found on the page (0 for a non-200 response),
									        which the caller uses to detect the last page.

									    Raises:
									        Exceptions from ``requests.get`` (including timeouts) and from
									        ``save_to_db`` propagate to the caller.
									    """
									    page_data = requests.get(url, timeout=30)
									    if page_data.status_code != 200:
									        logging.warning("unexpected status %s for %s", page_data.status_code, url)
									        return 0
									    soup = BeautifulSoup(page_data.text, "lxml")

									    tag = tag or url.split("?")[0].split("/")[-1]

									    items = soup.select("#subject_list > ul > li")
									    book_reslist = []
									    for item in items:
									        try:
									            row = _parse_book_item(item)
									        except Exception:
									            # Best effort: one malformed entry must not discard the page.
									            logging.exception("failed to parse a book entry on %s", url)
									            continue
									        if row is not None:
									            book_reslist.append(row)

									    logging.info("insert count: %d" % len(book_reslist))
									    if book_reslist:
									        save_to_db(tag, book_reslist)
									    return len(items)

									def main():
									    """Crawl every tag URL listed in ``book_tags.txt``, page by page.

									    Pages are requested 20 entries apart until a page reports fewer than 10
									    entries. A tag is abandoned after three consecutive failed pages. The
									    crawler pauses 10-15 seconds between pages and 300 seconds between tags.
									    """
									    max_consecutive_failures = 3

									    # The tag URLs contain Chinese text, so do not rely on the platform
									    # default encoding; read everything up front and release the file.
									    with open("book_tags.txt", encoding="utf-8") as fd:
									        tags = [line.strip() for line in fd]

									    for tag in tags:
									        if not tag:
									            # Blank lines would otherwise turn into malformed URLs.
									            continue
									        logging.info("current tag url: %s" % tag)
									        failures = 0
									        for idx in range(0, 1000000, 20):
									            try:
									                url = "%s?start=%d&type=T" % (tag, idx)
									                cnt = do_parse(tag.split('/')[-1], url)
									            except Exception as e:
									                logging.warning("outer_err: %s" % e)
									                failures += 1
									                if failures >= max_consecutive_failures:
									                    # Stop hammering the site when errors persist.
									                    break
									            else:
									                failures = 0
									                if cnt < 10:
									                    break
									            # Sleep a few seconds to keep the request rate low; this also
									            # applies after a failed page so errors never cause a tight loop.
									            time.sleep(random.randint(10, 15))
									        time.sleep(300)


									if __name__ == "__main__":
									    main()

小结

以上代码基于python3环境来运行；
需要首先安装依赖：pip install requests pymysql beautifulsoup4 lxml（代码中除 BeautifulSoup 外，还用到了 requests、pymysql 以及 lxml 解析器）；
爬取过程中需要控制好访问频率；
需要对一些信息进行异常处理，比如译者信息、评论人数等。

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持服务器之家。

原文链接：http://blog.csdn.net/moxiaomomo/article/details/79023873

python书籍信息爬虫实例

相关文章

热门资讯