本文实例讲述了Python查询阿里巴巴关键字排名的方法。分享给大家供大家参考。具体如下:
这里使用 Python 的 urllib2 和 pyquery 库的基本功能,实现阿里巴巴关键词排名的查询,其中涉及 urllib2 代理的设置,以及使用 pyquery 对 HTML 文档进行解析。
1. urllib2 基础模块的应用:通过下面的 ProxyScrapy 类获取 URL 对应的 HTML 文档内容,可以在类内部重写获取代理的方法。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
class ProxyScrapy(object):
    """Download HTML pages with a cookie-aware urllib2 opener, optionally via a proxy.

    Relies on names defined elsewhere in the module: ``ProxyRobot`` (proxy
    pool), ``PROXY_ENABLE`` (switch for proxy usage), ``urllib2`` and
    ``cookielib``.
    """

    def __init__(self):
        self.proxy_robot = ProxyRobot()
        # Proxy picked for the most recent request; stays None while
        # PROXY_ENABLE is false.
        self.current_proxy = None
        # One cookie jar shared by every opener this instance builds.
        self.cookie = cookielib.CookieJar()

    def __builder_proxy_cookie_opener(self):
        """Build, install globally and return a urllib2 opener.

        The opener always carries the instance's cookie jar.  When
        PROXY_ENABLE is true, a random proxy is drawn from the proxy robot,
        remembered in ``self.current_proxy`` and added as an HTTP proxy.
        """
        handlers = [urllib2.HTTPCookieProcessor(self.cookie)]
        if PROXY_ENABLE:
            self.current_proxy = ip_port = self.proxy_robot.get_random_proxy()
            # Drops the first 7 characters of the proxy string -- presumably
            # a leading "http://" scheme; confirm against ProxyRobot.
            handlers.append(urllib2.ProxyHandler({'http': ip_port[7:]}))
        opener = urllib2.build_opener(*handlers)
        # NOTE: install_opener changes the process-wide default opener.
        urllib2.install_opener(opener)
        return opener

    def get_html_body(self, url, max_retries=None):
        """Return the body of ``url``, retrying until a 200 response is read.

        Every attempt builds a fresh opener (and, with PROXY_ENABLE, picks a
        new random proxy).  Retries run in a loop rather than by recursion,
        so a long failure streak cannot exhaust the interpreter's recursion
        limit.

        :param url: address to fetch.
        :param max_retries: number of extra attempts allowed after the first
            one; ``None`` (default) retries without limit, matching the
            original behaviour.
        :returns: the response body, or ``None`` when ``max_retries`` is
            exhausted without a 200 response.
        """
        attempts = 0
        while max_retries is None or attempts <= max_retries:
            attempts += 1
            opener = self.__builder_proxy_cookie_opener()
            request = urllib2.Request(url)
            # Optional request headers, left disabled:
            # request.add_header("Accept-Encoding", "gzip,deflate,sdch")
            # request.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
            # request.add_header("Cache-Control", "no-cache")
            # request.add_header("Connection", "keep-alive")
            try:
                response = opener.open(request, timeout=2)
                try:
                    if response.getcode() == 200:
                        if PROXY_ENABLE:
                            self.proxy_robot.handle_success_proxy(self.current_proxy)
                        return response.read()
                finally:
                    # Release the connection whether or not the body was read.
                    response.close()
                # Non-200 answer: report the proxy and try again.
                if PROXY_ENABLE:
                    self.proxy_robot.handle_double_proxy(self.current_proxy)
            except Exception as inst:
                print('%s %s' % (inst, self.current_proxy))
                # Only report a proxy failure when a proxy was actually used;
                # without PROXY_ENABLE current_proxy is None.
                if PROXY_ENABLE:
                    self.proxy_robot.handle_double_proxy(self.current_proxy)
        return None
2. 根据输入的公司名及关键词列表,返回每个关键词的排名
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
def search_keywords_rank(keyword_company_name, keywords, max_page_retries=None):
    """Find where a company's product first appears for each search keyword.

    For every keyword, result pages 1 through 8 are fetched in order and
    scanned for the first listing whose company link contains
    ``keyword_company_name``.  Depends on module-level names defined
    elsewhere: ``url`` (a format string taking keyword and page number),
    ``curl`` (object with ``get_html_body``), ``pq`` (pyquery), ``clock``,
    ``re`` and ``OrderedDict``.

    :param keyword_company_name: substring looked for in each listing's
        company link ``href``.
    :param keywords: iterable of keyword strings to look up.
    :param max_page_retries: number of extra fetches allowed when a page
        yields fewer than 38 listings; ``None`` (default) keeps re-fetching
        until a full page arrives, as the original code did.  Once the limit
        is reached the short page is used as-is.
    :returns: ``OrderedDict`` mapping each keyword that was found to
        ``(product title, product url, page, index on page, total index,
        page url)``.  Keywords without a match are absent.
    """
    items_per_page = 38
    # Pages after the first get 5 added to the overall position -- presumably
    # extra listings shown on the first page; TODO confirm against the site.
    later_page_offset = 5

    def get_context(page_url):
        """Fetch ``page_url`` and return ``(listing items, item count)``."""
        attempts = 0
        while True:
            attempts += 1
            start = clock()
            html = curl.get_html_body(page_url)
            finish = clock()
            print('%s %s' % (page_url, finish - start))
            items = pq(html)("#J-items-content .ls-item")
            items_c = len(items)
            print(items_c)
            if items_c >= items_per_page:
                return items, items_c
            # A short page is treated as a failed load and fetched again,
            # iteratively so repeated failures cannot overflow the stack.
            if max_page_retries is not None and attempts > max_page_retries:
                return items, items_c

    result = OrderedDict()
    for keyword in keywords:
        # Whitespace runs in the keyword become underscores in the URL.
        keyword_slug = re.sub(r'\s+', '_', keyword.strip())
        for page_index in range(1, 9):
            u = url % (keyword_slug, page_index)
            items, items_c = get_context(u)
            found = False
            for item_index in range(items_c):
                item = items.eq(item_index)
                company_url = item.find('.cright h3 .dot-product').attr('href')
                # attr() gives None when the listing has no company link;
                # skip it instead of raising TypeError on the "in" test.
                if company_url is None or keyword_company_name not in company_url:
                    continue
                title_link = item.find('.title a')
                p_title = title_link.text()
                p_url = title_link.attr('href')
                total_index = ((page_index - 1) * items_per_page + item_index + 1
                               + (0 if page_index == 1 else later_page_offset))
                print('page %s, index %s, total index %s'
                      % (page_index, item_index + 1, total_index))
                found = True
                if keyword not in result:
                    result[keyword] = (p_title, p_url, page_index,
                                       item_index + 1, total_index, u)
                break
            if found:
                # First hit is the rank; skip the remaining pages.
                break
    return result
希望本文所述对大家的Python程序设计有所帮助。