本文实例讲述了Python实现登录人人网并抓取新鲜事的方法。分享给大家供大家参考。具体如下:
这里演示了Python登录人人网并抓取新鲜事的方法(抓取后的排版不太美观~~)
代码如下:
# Log in to renren.com and print the news-feed entries found on the page that
# the login request returns.
#
# The script was written for Python 2 (sgmllib / urllib2 / cookielib). The
# fallback import branch lets the same code run on Python 3, where those
# modules were removed or renamed.
import sys

try:
    from sgmllib import SGMLParser
    import urllib2
    import urllib
    import cookielib
except ImportError:
    # Python 3: bind the renamed modules to their Python 2 names so the class
    # below uses a single spelling on both versions.
    import urllib.request as urllib2
    import urllib.parse as urllib
    import http.cookiejar as cookielib
    from html.parser import HTMLParser

    class SGMLParser(HTMLParser):
        """Stand-in for the removed ``sgmllib.SGMLParser``.

        Routes every tag to a ``start_<tag>(attrs)`` / ``end_<tag>()`` method
        when the subclass defines one, which is the only sgmllib behaviour
        this script relies on.
        """

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, 'start_' + tag, None)
            if handler is not None:
                handler(attrs)

        def handle_endtag(self, tag):
            handler = getattr(self, 'end_' + tag, None)
            if handler is not None:
                handler()


def _console_text(text):
    """Return *text* in a form that ``print`` can write to the console.

    On Python 2 the parser hands back UTF-8 byte strings, which are re-encoded
    to the filesystem encoding (as the original script did). On Python 3 the
    text is already ``str`` and is returned unchanged.
    """
    if not isinstance(text, bytes):
        return text
    # getfilesystemencoding() may return None on some Python 2 platforms.
    encoding = sys.getfilesystemencoding() or 'utf-8'
    return text.decode('utf-8').encode(encoding)


class spider(SGMLParser):
    """Log in to renren.com and collect feed entries from the returned HTML.

    Parsing is a small state machine driven by the tag callbacks: the text of
    an ``<a>`` inside an ``<h3>`` is taken as the poster's name, and the
    remaining ``<h3>`` text plus the text of the following
    ``<div class="content">`` are stored under that name in ``self.dic``.
    """

    def __init__(self, email, password):
        """Store the credentials and install a cookie-aware URL opener.

        Note that ``urllib2.install_opener`` replaces the process-wide default
        opener, so later ``urllib2.urlopen`` calls share the cookie jar.
        """
        SGMLParser.__init__(self)
        # Parser state flags.
        self.h3 = False            # currently inside <h3>
        self.h3_is_ready = False   # an <h3> was closed; a content div may follow
        self.div = False           # currently inside <div class="content">
        self.h3_and_div = False    # div text belongs to the last <h3> name
        self.a = False             # currently inside a relevant <a>
        self.depth = 0             # nesting depth of divs inside the content div
        self.names = ""            # poster name accumulated from <h3><a>
        self.dic = {}              # name -> list of collected text fragments
        self.email = email
        self.password = password
        self.domain = 'renren.com'
        self.file = ""             # page body filled in by login()
        cookie = cookielib.CookieJar()
        cookieProc = urllib2.HTTPCookieProcessor(cookie)
        opener = urllib2.build_opener(cookieProc)
        urllib2.install_opener(opener)

    def login(self):
        """POST the credentials and keep the response body in ``self.file``."""
        # NOTE(review): the credentials travel over plain HTTP; switch to an
        # HTTPS endpoint if the site offers one.
        url = 'http://www.renren.com/PLogin.do'
        postdata = {
            'email': self.email,
            'password': self.password,
            'domain': self.domain,
        }
        # urlencode() output is percent-encoded ASCII; Python 3 requires the
        # request body as bytes, and on Python 2 the encode is a no-op.
        req = urllib2.Request(url, urllib.urlencode(postdata).encode('ascii'))
        response = urllib2.urlopen(req)
        try:
            page = response.read()
        finally:
            response.close()
        if not isinstance(page, str):
            # Python 3 returns bytes but the parser needs text. UTF-8 is the
            # encoding show() has always assumed for this page.
            page = page.decode('utf-8', 'replace')
        self.file = page

    def start_h3(self, attrs):
        self.h3 = True

    def end_h3(self):
        self.h3 = False
        self.h3_is_ready = True

    def start_a(self, attrs):
        if self.h3 or self.div:
            self.a = True

    def end_a(self):
        self.a = False

    def start_div(self, attrs):
        # Divs only matter once an <h3> has been seen.
        if not self.h3_is_ready:
            return
        if self.div:
            self.depth += 1
        for k, v in attrs:
            if k == 'class' and v == 'content':
                self.div = True
                self.h3_and_div = True  # h3 and div are connected

    def end_div(self):
        if self.depth == 0:
            # Outermost div closed: this entry is finished, reset the state.
            self.div = False
            self.h3_and_div = False
            self.h3_is_ready = False
            self.names = ""
        if self.div:
            self.depth -= 1

    def handle_data(self, text):
        # Record the name.
        if self.h3 and self.a:
            self.names += text
        # Record what was said in the heading itself.
        if self.h3 and not self.a:
            if text:
                self.dic.setdefault(self.names, []).append(text)
            return
        if self.h3_and_div:
            self.dic.setdefault(self.names, []).append(text)

    def format_feed(self):
        """Return one ``"<name> <text>"`` line per collected entry.

        Spaces are stripped from both the name and the joined text fragments.
        """
        lines = []
        for key in self.dic:
            name = _console_text(key.replace(' ', ''))
            said = _console_text(''.join(self.dic[key]).replace(' ', ''))
            lines.append('%s %s' % (name, said))
        return lines

    def show(self):
        """Print every collected entry, one per line."""
        for line in self.format_feed():
            print(line)


if __name__ == '__main__':
    renrenspider = spider('your email', 'your password')
    renrenspider.login()
    renrenspider.feed(renrenspider.file)
    renrenspider.show()
希望本文所述对大家的Python程序设计有所帮助。