Development environment: Python 3.4
Operating system: Windows 8
Main functionality: crawl the chapter list of a given novel page, save each chapter to a local text file, and record the URLs of crawled pages in a local log file.
Target site: http://www.cishuge.com/
Novel: 灵棺夜行
Code source: written by myself
import urllib.request
import http.cookiejar
import socket
import time
import re

# Global socket timeout so a stalled request cannot hang the script forever
timeout = 20
socket.setdefaulttimeout(timeout)
# Pause between chapter downloads so the site is not hammered
sleep_download_time = 10


def makeMyOpener(head={
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'}):
    """Build an opener that carries cookies and browser-like request headers."""
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = list(head.items())
    return opener


def saveFile(save_path, txts):
    """Write one chapter to disk, one extracted paragraph per line."""
    with open(save_path, 'w', encoding='utf-8') as f_obj:
        for item in txts:
            f_obj.write(item + '\n')


# Fetch the chapter list (table of contents) page
code_list = 'http://www.cishuge.com/read/0/771/'
oper = makeMyOpener()
uop = oper.open(code_list, timeout=1000)
data = uop.read().decode('gbk', 'ignore')
uop.close()

# Each chapter appears as <li><a href="...">chapter title</a></li>
pattern = re.compile('<li><a href="(.*?)".*?>(.*?)</a></li>', re.S)
items = re.findall(pattern, data)
print('Chapter list fetched')

# Load the log of already-downloaded URLs; fall back to an empty log on the first run
url_path = 'url_file.txt'
try:
    with open(url_path, 'r') as url_r:
        url_arr = url_r.readlines()
except FileNotFoundError:
    url_arr = []
print(len(url_arr))

url_file = open(url_path, 'a')
print('Loaded the list of already-downloaded URLs')

for tmp in items:
    # tmp is (relative chapter URL, chapter title)
    save_path = tmp[1].replace(' ', '') + '.txt'
    url = code_list + tmp[0]
    # Skip chapters that were already crawled in a previous run
    if url + '\n' in url_arr:
        continue
    print('Logging URL: ' + url)
    url_file.write(url + '\n')

    # Download the chapter page and pull out the text before each <br />
    opene = makeMyOpener()
    op1 = opene.open(url, timeout=1000)
    data = op1.read().decode('gbk', 'ignore')
    op1.close()
    pattern = re.compile(' (.*?)<br />', re.S)
    txts = re.findall(pattern, data)
    saveFile(save_path, txts)

    # Be polite: wait before requesting the next chapter
    time.sleep(sleep_download_time)

url_file.close()
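For reference, the chapter-list regex yields (relative URL, chapter title) tuples, one per list item. A minimal sketch against a made-up snippet shaped like the cishuge.com chapter list (the href and title below are invented for illustration):

import re

# Hypothetical list item in the same shape as the real chapter-list page
sample = '<li><a href="0001.html" title="Chapter 1">Chapter 1</a></li>'
pattern = re.compile('<li><a href="(.*?)".*?>(.*?)</a></li>', re.S)
print(re.findall(pattern, sample))
# prints [('0001.html', 'Chapter 1')]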
The code still has some rough edges, but I am sharing it anyway so we can improve it together.
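One possible improvement, sketched here under the assumption that the URL log stays a plain text file: load the finished URLs into a set so the membership test is O(1), and append a URL only after its chapter has been written, so a chapter that fails mid-download gets retried on the next run. The helper names load_done_urls and mark_done are hypothetical, not part of the original script.

import os

def load_done_urls(path='url_file.txt'):
    # Read the log of finished URLs into a set; empty set on the first run
    if not os.path.exists(path):
        return set()
    with open(path, 'r') as f:
        return {line.strip() for line in f if line.strip()}

def mark_done(url, path='url_file.txt'):
    # Append a URL to the log only after its chapter was saved successfully
    with open(path, 'a') as f:
        f.write(url + '\n')

In the main loop this would replace the "if url + '\n' in url_arr" check and move the url_file.write call to after saveFile has finished.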