如何获取一个网站的相关信息,获取赶集网的招聘信息,本文为大家介绍利用python获取赶集网招聘信息的关键代码,供大家参考,具体内容如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
import re
import urllib
import urllib.request


def _collect_links(html, url_pattern, url_prefix='', strip_key=False):
    """Map the text of every ``<a>`` tag in *html* to a URL taken from that tag.

    Args:
        html: HTML fragment to scan.
        url_pattern: Regex with one capture group; its first match inside
            an anchor is used as that anchor's URL.
        url_prefix: String prepended to every captured URL.
        strip_key: When true, surrounding whitespace is removed from the
            anchor text before it is used as a dictionary key.

    Returns:
        A dict of ``{anchor_text: url}``. Anchors that have no text match or
        no URL match are skipped instead of raising ``IndexError``.
    """
    links = {}
    for anchor in re.findall(r'(<a.*?</a>)', html, re.S):
        text = re.search('>(.*?)</', anchor, re.S)
        link = re.search(url_pattern, anchor, re.S)
        if text is None or link is None:
            # Malformed anchor: nothing usable to record.
            continue
        key = text.group(1)
        if strip_key:
            key = key.strip()
        links[key] = url_prefix + link.group(1)
    return links


# Fetch data from ganji.com
def begin(url):
    """Download *url* with a browser-like User-Agent and return the body.

    Args:
        url: Address to request.

    Returns:
        The response body decoded as UTF-8.
    """
    # Browser to impersonate (Chrome).
    headers = (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
    )
    opener = urllib.request.build_opener()
    # Attach the impersonated browser to the HTTP headers of every request.
    opener.addheaders = [headers]
    # The response is closed as soon as the body has been read.
    with opener.open(url) as response:
        data = response.read()
    return data.decode('utf-8')


# Process the data: return a dict mapping each city to its URL
def get_cityinfo(data):
    """Extract ``{city_name: city_url}`` from the first ``<dl>`` of *data*.

    Args:
        data: HTML text of the city index page.

    Returns:
        Dict built from the anchors of the first ``dl>...</dl>`` section;
        empty when no such section exists.
    """
    sections = re.findall(r'dl>(.*?)</dl>', data, re.S)
    if not sections:
        return {}
    return _collect_links(sections[0], 'href="(.*?)"')


# Collect all <a> tags and extract information from them
def a_info(data):
    """Extract ``{anchor_text: first_quoted_value}`` from ``data[0]``.

    Args:
        data: Sequence whose first element is the HTML fragment to scan.

    Returns:
        Dict keyed by anchor text; the value is the first double-quoted
        string found inside the anchor. Empty when *data* is empty.
    """
    if not data:
        return {}
    return _collect_links(data[0], '"(.*?)"')


# Get all categories available for a city
def get_cityinfoclass():
    """Return the URL path segment of the category to scrape.

    Only the recruitment category is supported for now.
    """
    info = 'zhaopin/'
    return info


# Fetch the page for a given city and category
def getzhaopin(city_info, infoclass, city='成都'):
    """Download the category page of one city.

    Args:
        city_info: Dict of ``{city_name: city_url}``.
        infoclass: Category path segment appended to the city URL.
        city: Key looked up in *city_info*; defaults to the city the
            script originally hard-coded.

    Returns:
        Tuple ``(city_url, page_html)``.

    Raises:
        KeyError: If *city* is not a key of *city_info*.
    """
    city_url = city_info[city]
    cdzp_url = city_url + infoclass
    cdzp_info = begin(cdzp_url)
    return city_url, cdzp_info


# Get the recruitment categories of the city page
def get_zhaopin_info(city_url, cdzp_info):
    """Extract ``{job_category: absolute_url}`` from a recruitment page.

    Args:
        city_url: Base URL prepended to each category's relative link.
        cdzp_info: HTML text of the recruitment page.

    Returns:
        Dict built from the anchors inside every ``<dd>`` of the first
        ``class="f-all-news"`` section; keys are whitespace-stripped.
        Empty when the section is missing.
    """
    allzp_info = re.findall('class="f-all-news"(.*?)</div>', cdzp_info, re.S)
    if not allzp_info:
        return {}
    a_dict = {}
    for group in re.findall('<dd>(.*?)</dd>', allzp_info[0], re.S):
        a_dict.update(
            _collect_links(group, 'href="/(.*?)"', url_prefix=city_url,
                           strip_key=True)
        )
    return a_dict


# Get the detailed content of the recruitment listings
def get_city_zpinfo_detail(url, job='软件工程师'):
    """Download one job category's listing page and print its first entry.

    Args:
        url: Despite its name, the script passes the ``{job_category: url}``
            dict here, so a mapping is looked up under *job*. A plain
            string is treated as the listing URL itself.
        job: Category key used when *url* is a mapping; defaults to the
            category the script originally hard-coded.

    Raises:
        KeyError: If *url* is a mapping that does not contain *job*.
    """
    # Use the argument instead of relying on a module-level global that
    # only exists when the file is run as a script.
    listing_url = url if isinstance(url, str) else url[job]
    job_url_info = begin(listing_url)
    get_detail_info(job_url_info)


# Process the information of the listing page
def get_detail_info(list_info):
    """Print the first job entry found in *list_info*.

    Args:
        list_info: HTML text of a job listing page.

    Nothing is printed when the page contains no job entry.
    """
    job_info = re.findall(
        '<dl class="list-noimg job-list clearfix"(.*?)</dl', list_info, re.S)
    if job_info:
        print(job_info[0])


if __name__ == '__main__':
    url = 'http://www.ganji.com/index.htm'
    data = begin(url)
    # All city information
    city_info = get_cityinfo(data)
    # The category to scrape
    infoclass = get_cityinfoclass()
    cdzp_url, xiaoshou = getzhaopin(city_info, infoclass)
    # Recruitment category information
    zp_class_info = get_zhaopin_info(cdzp_url, xiaoshou)
    get_city_zpinfo_detail(zp_class_info)
以上就是本文的全部内容,希望对大家的学习有所帮助。