本文实例讲述了Python正则抓取网易新闻的方法。分享给大家供大家参考,具体如下:
自己写了些关于抓取网易新闻的爬虫,发现其网页源代码与网页的评论根本就对不上,所以,采用了抓包工具得到了其评论的隐藏地址(每个浏览器都有自己的抓包工具,都可以用来分析网站)
如果仔细观察抓包结果就会发现,其中有一个比较特殊的请求地址,它返回的正是评论数据,这个就是自己想要的了
然后打开链接就可以找到相关的评论内容了。(下图为第一页内容)
接下来就是代码了(也照着大神的改改写写了)。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
# coding=utf-8
"""Scrape the comments of one NetEase (163.com) news article into WY.txt.

The comment data is not in the article page itself; it is served from
separate URLs as a JavaScript assignment (``var replyData=...`` for the
first page, ``var newPostList=...`` for later pages).  This script strips
that wrapper, parses the remainder as JSON and appends one ``|``-separated
row per comment to ``WY.txt``.

Runs on both Python 2.7 and Python 3.
"""
import contextlib
import io
import json
import re
import time

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request exposes Request, urlopen, URLError
    import urllib.request as urllib2

# Text type of the running interpreter (``unicode`` on 2.x, ``str`` on 3.x).
_TEXT = type(u'')

# Markup embedded in the comment payload that Process() strips.
_LINK_OPEN = re.compile(r" \[<a href=''>")
_LINK_CLOSE = re.compile(r'<\\/a>\]')
_LINE_BREAK = re.compile(r'<br>')

# Keys read from each comment record, in output column order.
_ROW_KEYS = ('f', 'b', 'a', 'v')


class WY(object):
    """Downloader for the comment pages of a single NetEase news article."""

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.24 (KHTML, like '
        }
        # First page of comments; later pages come from getpage().
        self.url = 'http://comment.news.163.com/data/news3_bbs/df/B9IBDHEH000146BE_1.html'

    def getpage(self, page):
        """Return the URL of comment page number *page*."""
        full_url = ('http://comment.news.163.com/cache/newlist/news3_bbs/B9IBDHEH000146BE_'
                    + str(page) + '.html')
        return full_url

    def gethtml(self, page):
        """Fetch the URL *page* and return the raw response body.

        Returns ``None`` (after printing the reason) when the request fails
        with ``URLError``; callers must check for that.
        """
        request = urllib2.Request(page, None, self.headers)
        try:
            # closing() because Python 2 responses are not context managers.
            with contextlib.closing(urllib2.urlopen(request)) as response:
                return response.read()
        except urllib2.URLError as e:
            print(u"连接失败 %s" % (getattr(e, 'reason', e),))
            return None

    # Clean up the raw payload string.
    def Process(self, data, page):
        """Strip the JavaScript variable prefix and inline markup from *data*.

        Page 1 uses the ``var replyData=`` prefix, every other page uses
        ``var newPostList=``.  Any trailing characters after the JSON body
        are left in place; the caller trims them.
        """
        prefix = 'var replyData=' if page == 1 else 'var newPostList='
        data = data.replace(prefix, '')
        data = _LINK_OPEN.sub(' ', data)
        data = _LINK_CLOSE.sub('', data)
        return _LINE_BREAK.sub('', data)

    def _fetch_posts(self, page):
        """Download and parse one comment page; return its list of records.

        Returns an empty list when the download failed so that one bad page
        does not abort the whole run.
        """
        if page == 1:
            url, key, trailing = self.url, 'hotPosts', 1
        else:
            url, key, trailing = self.getpage(page), 'newPosts', 2
        raw = self.gethtml(url)
        if raw is None:
            return []
        # Decode once at the I/O boundary (the original relied on json.loads'
        # default UTF-8 handling of byte strings).
        text = raw.decode('utf-8')
        # The payload ends with characters that are not part of the JSON body
        # (presumably ";" on page 1 and ";" plus a newline afterwards).
        payload = self.Process(text, page)[:-trailing]
        return json.loads(payload)[key]

    @staticmethod
    def _format_row(post):
        """Build one complete output line from a comment record.

        Raises ``KeyError``/``TypeError`` when the record lacks the expected
        fields, so the caller can skip it without writing a partial row.
        """
        fields = post['1']
        return u'|'.join(_TEXT(fields[key]) for key in _ROW_KEYS) + u'\n'

    # Parse the JSON pages and write them out.
    def dealJSON(self):
        """Fetch every comment page and append its rows to ``WY.txt``."""
        with io.open('WY.txt', 'a', encoding='utf-8') as out:
            out.write(u'ID|评论|踩|顶\n')
            # NOTE(review): the progress text says "/12" but only pages 1-11
            # are requested -- confirm which of the two is intended.
            for i in range(1, 12):
                for post in self._fetch_posts(i):
                    try:
                        row = self._format_row(post)
                    except (KeyError, TypeError):
                        # Malformed record: skip it entirely.
                        continue
                    out.write(row)
                # Make each finished page durable before the long pause.
                out.flush()
                print('--正在采集%d/12--' % i)
                # Throttle requests to be polite to the server.
                time.sleep(5)


if __name__ == '__main__':
    WY().dealJSON()
以上就是我爬取的代码了。
希望本文所述对大家Python程序设计有所帮助。