This article shares example Python code for crawling WeChat public account articles, for your reference. The full script is below.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time
import random
import MySQLdb
import threading
import socket
import math

socket.setdefaulttimeout(60)  # set a timeout on the whole socket layer; later socket use inherits it
glock = threading.Lock()      # global lock
CATEGORY_URL = ['http://www.we123.com/gzh/onclick/']  # regional category links
all_url = []
ALL_URLS = []    # all detail page links
proxy_list = []  # proxy IP pool
URL = 'http://www.we123.com'
PAGE_URL = []    # all pagination links


# Fetch the proxy IP pool
def get_ip():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    url = 'http://http-webapi.zhimaruanjian.com'  # Zhima proxy service; stable and inexpensive
    resp = requests.get(url, headers=headers)
    obj = resp.json()  # JSON list of proxy IPs
    for ip in obj:
        arr = 'http://' + str(ip['ip']) + ':' + str(ip['port'])
        proxy_list.append(arr)


# Fetch the page source
def get_html(url):
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3538.400 QQBrowser/9.6.12501.400',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'
    ]
    # user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3538.400 QQBrowser/9.6.12501.400'}
    # Free proxies only survive for a short while; replace them yourself
    # proxy_ip = random.choice(proxy_list)
    # proxies = {'http': proxy_ip}
    try:
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            return resp
        elif resp.status_code == 404:
            return resp
        elif resp.status_code == 500:
            return resp
        return resp
    except RuntimeError:
        print("timeout")
        return "error"
    except ConnectionError:
        print("connection timeout")
        return "error"
    except RequestException:
        print("requests base exception")
        with open('url_exception.txt', 'a+', encoding='utf-8') as f:
            f.write(str(url))
            f.write('\n')
        return "error"


# Collect the regional category links
def get_categoty_url():
    url = 'http://www.we123.com/gzh/onclick/'
    resp = get_html(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    html = soup.select('div.div-subs2 > div.divst-content > div.divst-subs > li > a')
    for i in html:
        city = i['href'].split("/")[-1]
        if city == '海外' or city == '台湾' or city == '澳门':
            continue
        url = URL + i['href']
        CATEGORY_URL.append(url)
    print(CATEGORY_URL)


# Collect every pagination link under one region
def get_page_url(url):
    city = url.split('/')[-1]
    html = get_html(url)
    if html == "error":
        print("get_page_url: connect url error")
        time.sleep(random.randint(10, 20))
        return "error"
    soup = BeautifulSoup(html.text, 'lxml')
    # total number of entries
    all_nums = soup.select("div.page > a > b")
    if len(all_nums) == 0:
        return "error"
    else:
        all_nums = soup.select("div.page > a > b")[0].get_text()
    # total number of pages (30 entries per page)
    all_pages = math.ceil(int(all_nums) / 30)
    # build all pagination links
    all_page_url = []
    for i in range(0, int(all_pages)):
        page_url = 'http://www.we123.com/e/action/ListInfo.php?page=' + str(i) + '&classid=45&line=30&tempid=10&orderby=onclick&myorder=0&totalnum=' + str(all_nums)
        all_page_url.append(page_url)
    return all_page_url


# Fill PAGE_URL with the pagination links of one region
def get_page_urls():
    global PAGE_URL
    c_url = CATEGORY_URL.pop()
    print('requesting category: ' + c_url)
    PAGE_URL = get_page_url(c_url)  # all pagination links under this region


# Collect all detail page links
def get_info_urls():
    while True:
        global PAGE_URL  # use the global list
        glock.acquire()  # lock
        if len(PAGE_URL) == 0:
            glock.release()  # unlock
            print('get_info_urls: PAGE_URL is empty')
            break
        else:
            p_url = PAGE_URL.pop()
            print('requesting page: ' + p_url)
            glock.release()  # unlock
        glock.acquire()  # lock
        html = get_html(p_url)
        if html == "error":
            print("get_info_urls: connect url error")
            time.sleep(2)
            glock.release()  # release before returning, otherwise other threads block forever
            return
        soup = BeautifulSoup(html.text, 'lxml')
        info_urls = soup.select('div.gzhRight > div.gzh_list > ul > li > a')
        for x in info_urls:
            i_url = URL + x['href']
            ALL_URLS.append(i_url)
        print("detail links in stock: " + str(len(ALL_URLS)))
        glock.release()  # unlock


# Scrape the data we need from each detail page
def get_data():
    while True:
        global ALL_URLS  # use the global list
        glock.acquire()  # lock
        print("current stock: " + str(len(ALL_URLS)))
        if len(ALL_URLS) == 0:
            glock.release()  # unlock
            print('get_data: ALL_URLS is empty')
            break
        else:
            url = ALL_URLS.pop()
            print("start scraping: " + url)
            glock.release()  # unlock
            time.sleep(1)  # sleep for one second
            html = get_html(url)
            if html == "error":
                print("get_data: connect url error")
                time.sleep(random.randint(2, 4))
                return
            html.encoding = 'utf-8'  # set the page encoding explicitly; usually unnecessary
            soup = BeautifulSoup(html.text, 'lxml')
            # account name
            names = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > h1')
            # WeChat ID
            accounts = []
            accounts.append(soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > p')[0])
            # avatar
            imgs = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > img')
            # QR code
            QR_codes = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_right > img')
            # description
            descs = soup.select('div.artcleLeft > div.xcxnry > div.xcxinfo')
            # account category
            category = ''
            cate = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.xcx_p > span > a')
            if len(cate) != 0:
                category = cate[0].get_text()
            else:
                category = '综合'
            glock.acquire()  # lock
            for name, account, img, QR_code, desc in zip(names, accounts, imgs, QR_codes, descs):
                data = {
                    'name': name.get_text(),
                    'category': category,
                    'account': account.get_text().split(":")[-1],
                    'img': img['src'],
                    'QR_code': QR_code['src'],
                    'desc': desc.get_text()
                }
                add_data(data, url)
            glock.release()  # unlock


# Insert one record into MySQL
def add_data(data, url):
    con = MySQLdb.connect('127.0.0.1', 'root', 'root', 'test', charset="utf8", use_unicode=True)
    cursor = con.cursor()
    insert_sql = """
    insert ignore into weixin5(w_name,category,account,img,QR_code,introduce)
    VALUES (%s,%s,%s,%s,%s,%s)
    """
    print('inserting: ' + data['name'] + '_' + data['account'] + ' - ' + url)
    try:
        cursor.execute(insert_sql, (data['name'], data['category'], data['account'], data['img'], data['QR_code'], str(data['desc'])))
        con.commit()
    except Exception:
        ALL_URLS.insert(0, url)  # put the url back so it can be retried
        print("insert failed: " + url)
        con.rollback()
    con.close()


# Convert a date string in the form %Y年%m月%d日 to a timestamp
def time_to(dt):
    timeArray = time.strptime(dt, "%Y年%m月%d日")
    timestamp = int(time.mktime(timeArray))
    return timestamp


# Start the crawler threads
def main():
    for x in range(3):
        th = threading.Thread(target=get_info_urls)
        th.start()
        # get_info_urls()
    time.sleep(3)
    for x in range(5):
        th = threading.Thread(target=get_data)
        th.start()


if __name__ == '__main__':
    # timing
    t1 = time.time()
    get_ip()         # fetch the proxy IP pool
    get_page_urls()  # fill PAGE_URL
    time.sleep(2)
    # get_categoty_url()
    main()
    print(time.time() - t1)
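
Two things are worth noting before running the script. First, add_data() assumes a MySQL table named weixin5 already exists, but the article never shows its schema. The sketch below creates a compatible table: the column names are taken from the INSERT statement above, while the types, sizes, and the unique key on account (which is what lets "insert ignore" actually skip duplicates) are my own assumptions, not part of the original code.

# -*- coding: utf-8 -*-
# Hypothetical helper, not part of the original script: creates the weixin5
# table that add_data() writes to. Column names come from the INSERT statement;
# all types, sizes, and the unique key are assumptions.
import MySQLdb

CREATE_SQL = """
CREATE TABLE IF NOT EXISTS weixin5 (
    id        INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    w_name    VARCHAR(255) NOT NULL,
    category  VARCHAR(64),
    account   VARCHAR(128),
    img       VARCHAR(512),
    QR_code   VARCHAR(512),
    introduce TEXT,
    UNIQUE KEY uk_account (account)  -- lets insert ignore skip duplicate accounts
) DEFAULT CHARSET = utf8
"""

def create_table():
    # same connection parameters as add_data() in the script above
    con = MySQLdb.connect('127.0.0.1', 'root', 'root', 'test', charset="utf8", use_unicode=True)
    try:
        cursor = con.cursor()
        cursor.execute(CREATE_SQL)
        con.commit()
    finally:
        con.close()

if __name__ == '__main__':
    create_table()

Second, get_ip() fills proxy_list, but get_html() never passes a proxies argument, so every request still goes out over your own IP. To actually route traffic through the pool you would pass something like proxies={'http': random.choice(proxy_list)} to requests.get(), as the commented-out lines in get_html() suggest.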
That is all for this article. I hope it helps with your learning, and please continue to support 服务器之家.
Original article: https://blog.csdn.net/qq_32364939/article/details/78442243