本文实例为大家分享了python下载微信公众号相关文章的具体代码,供大家参考,具体内容如下
目的:从零开始学自动化测试公众号中下载“pytest”一系列文档
1、搜索微信号文章关键字搜索
2、对搜索结果前n页进行解析,获取文章标题和对应url
主要使用的是requests和bs4中的BeautifulSoup
weixin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
import requests
from urllib.parse import quote
from bs4 import BeautifulSoup
import re
from weixinspider.html2doc import MyHTMLParser


class WeixinSpider(object):
    """Search Sogou Weixin for a public account's articles and collect
    (title, url) pairs whose title contains a keyword.

    NOTE(review): reconstructed from a case-mangled blog paste — the scrape
    lowercased every identifier (beautifulsoup, false, none, ...); canonical
    Python / library capitalization has been restored.
    """

    def __init__(self, gzh_name, pageno, keyword):
        self.gzh_name = gzh_name        # public-account name / query string
        self.pageno = pageno            # number of search-result pages to scan
        self.keyword = keyword.lower()  # case-insensitive title filter
        self.page_url = []              # search-result page urls
        self.article_list = []          # one {title: url} dict per page
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
        self.timeout = 5
        # Strip characters that are illegal in Windows file names (plus
        # CR/LF) from article titles before they are used as doc names.
        self.pattern = r'[\\/:*?"<>|\r\n]+'

    def get_page_url(self):
        """Build the search-result page urls for pages 1..pageno."""
        for i in range(1, self.pageno + 1):
            # e.g. https://weixin.sogou.com/weixin?query=...&type=2&page=2&ie=utf8
            url = "https://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%s&ie=utf8" \
                  % (quote(self.gzh_name), i)
            self.page_url.append(url)

    def get_article_url(self):
        """Fetch every page url and harvest matching article links.

        Fix over the original paste: a fresh dict is created per page.
        The original created one dict outside the loop and appended the
        same reference for every page, so article_list held n aliases of
        one dict and each article was later downloaded n times.
        """
        for url in self.page_url:
            article = {}
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            result = BeautifulSoup(response.text, 'html.parser')
            articles = result.select('ul[class="news-list"] > li > div[class="txt-box"] > h3 > a ')
            for a in articles:
                if self.keyword in a.text.lower():
                    # Sanitize the title so it is a valid file name.
                    new_text = re.sub(self.pattern, "", a.text)
                    article[new_text] = a["href"]
            self.article_list.append(article)


if __name__ == '__main__':
    # Guarding the script body avoids network side effects on import
    # (the original ran unconditionally at module level).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    timeout = 5
    gzh_name = 'pytest文档'
    my_gzh = WeixinSpider(gzh_name, 5, 'pytest')
    my_gzh.get_page_url()
    my_gzh.get_article_url()
    for article in my_gzh.article_list:
        for (key, value) in article.items():
            html_response = requests.get(value, headers=headers, timeout=timeout)
            # Fix over the original: it rebound the class name to the
            # instance (myhtmlparser = myhtmlparser(key)), so the second
            # iteration called the instance instead of the class.
            parser = MyHTMLParser(key)
            parser.feed(html_response.text)
            parser.doc.save(parser.docfile)
html2doc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
from html.parser import HTMLParser
import requests
from docx import Document
import re
from docx.shared import RGBColor
import docx


class MyHTMLParser(HTMLParser):
    """Render a WeChat article's HTML into a Word document.

    <hN> text becomes level-2 headings, <p> text becomes paragraphs,
    <code> runs are written in gray, and <img/> sources are downloaded
    and embedded.  Output path is hard-coded under ``d:\\pytest``.

    NOTE(review): reconstructed from a case-mangled blog paste — the
    scrape lowercased every identifier (htmlparser, document, false,
    rgbcolor, ...); canonical capitalization has been restored.
    """

    def __init__(self, docname):
        HTMLParser.__init__(self)
        self.docname = docname
        # Target file; docname must already be file-name safe
        # (the spider strips illegal characters before passing it in).
        self.docfile = r"d:\pytest\%s.doc" % self.docname
        self.doc = Document()
        self.title = False           # currently inside an <hN> tag
        self.code = False            # currently inside a <code> tag
        self.text = ''               # accumulated <p> text
        self.processing = None       # tag being accumulated ('p' or None)
        self.codeprocessing = None   # 'code' while inside a code block
        self.picindex = 1            # running suffix for downloaded images
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
        self.timeout = 5

    def handle_startendtag(self, tag, attrs):
        """Download <img/> sources and embed them into the document.

        NOTE(review): relies on WeChat emitting "data-type" (extension)
        before "data-src" (url) — if data-src came first, ``picname``
        would be unbound.  TODO: confirm attribute order or pre-scan attrs.
        """
        if tag == "img":
            if len(attrs) == 0:
                pass
            else:
                for (variable, value) in attrs:
                    if variable == "data-type":
                        picname = r"d:\pytest\%s%s.%s" % (self.docname, self.picindex, value)
                    if variable == "data-src":
                        picdata = requests.get(value, headers=self.headers, timeout=self.timeout)
                        self.picindex = self.picindex + 1
                        with open(picname, "wb") as pic:
                            pic.write(picdata.content)
                        try:
                            self.doc.add_picture(picname)
                        except docx.image.exceptions.UnexpectedEndOfFileError as e:
                            # Best effort: keep converting even when an
                            # image download was truncated.
                            print(e)

    def handle_starttag(self, tag, attrs):
        if re.match(r"h(\d)", tag):
            self.title = True
        if tag == "p":
            self.processing = tag
        if tag == "code":
            self.code = True
            self.codeprocessing = tag

    def handle_data(self, data):
        if self.title:
            self.doc.add_heading(data, level=2)
        if self.processing:
            # Paragraph text may arrive in several chunks; flush on </p>.
            self.text = self.text + data
        if self.code:
            p = self.doc.add_paragraph()
            run = p.add_run(data)
            run.font.color.rgb = RGBColor(111, 111, 111)  # gray "code" text

    def handle_endtag(self, tag):
        self.title = False
        if tag == self.processing:
            self.doc.add_paragraph(self.text)
            self.processing = None
            self.text = ''
        if tag == self.codeprocessing:
            self.code = False
运行结果:
缺少部分文档,如pytest文档4,是因为搜狗微信文章搜索结果中就没有
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/yaoliuwei1426/article/details/84707163