本文实例为大家分享了python爬虫获取小区经纬度、地址的具体代码,供大家参考,具体内容如下
通过小区名称调用百度地图 API 可以获取小区的地址以及经纬度。但是由于 API 返回值中的地址格式不统一,所以可以先利用小区名称进行一轮爬取,获取小区的经纬度,然后再利用经纬度做逆地理编码(Reverse Geocoding),得到小区的结构化地址。另外,小区名称如果是“……号”的形式,可以在爬取开始之前在“号”之后加一个“院”,得到的结果精确度更高。这次写的程序更加便于二次利用,只需要给程序传递一个 DataFrame 就可以坐等结果了。现在程序已经写好了,就等接下来在工作中看看效果如何了。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
import http.client
import re
import urllib.parse as urp
from urllib import request

import pandas as pd

# Seconds to wait for one HTTP response before treating the lookup as failed.
_REQUEST_TIMEOUT = 10

# Errors that mean "this single lookup failed" rather than "the program is
# broken": socket/URL errors (OSError), malformed HTTP (HTTPException) and
# bad URLs / undecodable bodies (ValueError, incl. UnicodeDecodeError).
_LOOKUP_ERRORS = (OSError, http.client.HTTPException, ValueError)

# The patterns tolerate both compact and pretty-printed JSON and always pick
# the first occurrence in the response text.
_NUMBER = r'(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)'
_LAT_RE = re.compile(r'"lat"\s*:\s*' + _NUMBER)
_LNG_RE = re.compile(r'"lng"\s*:\s*' + _NUMBER)
_ADDRESS_RE = re.compile(r'"address"\s*:\s*"(.*?)"')
# Deliberately has no leading quote, so it also matches a key that merely
# ends in `address` (same pattern the original reverse lookup used).
_REVERSE_ADDRESS_RE = re.compile(r'address":"(.*?)"')


def _fetch_text(url):
    """Return the decoded body of *url*; network/decoding errors propagate."""
    with request.urlopen(url, timeout=_REQUEST_TIMEOUT) as response:
        return response.read().decode()


def _first_number(pattern, text):
    """Return the first number captured by *pattern* in *text*, or 0."""
    match = pattern.search(text) if isinstance(text, str) else None
    return float(match.group(1)) if match else 0


def _require_ak(ak):
    """Raise ValueError when no API key was supplied."""
    if not ak:
        raise ValueError(
            "A Baidu Map API key is required; pass it as ak='...' "
            'when constructing the object.'
        )


class GetAddressInfo:
    """Look up longitude, latitude and address for rows of a DataFrame.

    Each row's ``name`` and ``city`` are sent to the
    ``api.map.baidu.com/place/v2/search`` endpoint and the first
    ``lat`` / ``lng`` / ``address`` values found in the response are stored
    in the columns '小区纬度', '小区经度' and '小区地址'.
    """

    def __init__(self, df, ak=''):
        """Store the frame to annotate and the API key.

        Args:
            df: pandas DataFrame that has at least the columns ``city`` and
                ``name``. It is modified in place by ``get_address``.
            ak: API key appended to every request as the ``ak`` parameter.

        Raises:
            AssertionError: if *df* is not a DataFrame or lacks a column.
        """
        is_valid = (
            isinstance(df, pd.DataFrame)
            and 'city' in df.columns
            and 'name' in df.columns
        )
        if not is_valid:
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError('The dataframe is not vailid')
        self.__data__ = df
        self._ak = ak

    def get_address(self):
        """Query every row and return the frame with the three new columns.

        Rows whose lookup fails get latitude 0, longitude 0 and the address
        string ``'None'``.

        Raises:
            ValueError: if no API key was given to the constructor.
        """
        _require_ak(self._ak)
        frame = self.__data__
        # Iterate positionally so duplicate index labels cannot break the
        # row-by-row assignment.
        found = [
            self.__get_neigbour_address__(name, city)
            for name, city in zip(frame['name'], frame['city'])
        ]
        frame['小区经度'] = [lng for _, lng, _ in found]
        frame['小区纬度'] = [lat for lat, _, _ in found]
        frame['小区地址'] = [address for _, _, address in found]
        return frame

    def __lat__(self, res):
        """Return the first ``"lat"`` number in response text *res*, or 0."""
        return _first_number(_LAT_RE, res)

    def __lng__(self, res):
        """Return the first ``"lng"`` number in response text *res*, or 0."""
        return _first_number(_LNG_RE, res)

    def __address__(self, res):
        """Return the first ``"address"`` string in *res*, or ``'None'``."""
        match = _ADDRESS_RE.search(res) if isinstance(res, str) else None
        return match.group(1) if match else 'None'

    def _search_url(self, name, city):
        """Build the place-search URL for one (name, city) pair."""
        query = urp.urlencode(
            {
                'query': name,
                'tag': '住宅区',
                'region': city,
                'output': 'json',
                'ak': self._ak,
            },
            quote_via=urp.quote,
        )
        return 'http://api.map.baidu.com/place/v2/search?' + query

    def __get_neigbour_address__(self, name, city):
        """Return ``(lat, lng, address)`` for one place.

        Any failure (non-text input, network error, undecodable body)
        yields the sentinel ``(0, 0, 'None')`` instead of raising.
        """
        failure = (0, 0, 'None')
        # Missing values (e.g. NaN) cannot form a meaningful query.
        if not isinstance(name, (str, bytes)):
            return failure
        if not isinstance(city, (str, bytes)):
            return failure
        try:
            res = _fetch_text(self._search_url(name, city))
        except _LOOKUP_ERRORS:
            return failure
        return self.__lat__(res), self.__lng__(res), self.__address__(res)


class ReverseGetAddress:
    """Turn stored latitude/longitude columns back into an address column.

    Each row's '小区纬度' and '小区经度' are sent to the
    ``api.map.baidu.com/geocoder/v2/`` endpoint and the first value whose key
    ends in ``address`` is written to the '小区地址' column.
    """

    def __init__(self, data, ak=''):
        """Store the frame to annotate and the API key.

        Args:
            data: DataFrame that has the columns '小区纬度', '小区经度' and
                ``name``. It is modified in place by ``get_address``.
            ak: API key appended to every request as the ``ak`` parameter.

        Raises:
            AssertionError: if a required column is missing.
        """
        columns = data.columns
        is_valid = (
            '小区纬度' in columns
            and '小区经度' in columns
            and 'name' in columns
        )
        if not is_valid:
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError('The DataFrame is not vailid')
        self.__data__ = data
        self._ak = ak

    def __get_address1__(self, url):
        """Fetch *url* and return the address found in it, else 'None1'."""
        try:
            body = _fetch_text(url)
        except _LOOKUP_ERRORS:
            return 'None1'
        match = _REVERSE_ADDRESS_RE.search(body)
        return match.group(1) if match else 'None1'

    def __to_string__(self, arr):
        """Return ``str(arr)``."""
        return str(arr)

    def __get_address2__(self):
        """Return a Series of request URLs, one per row, on the frame's index."""
        prefix = (
            'http://api.map.baidu.com/geocoder/v2/?callback=renderReverse'
            '&location='
        )
        # NOTE(review): `pois` is sent twice (0 and 1) exactly as in the
        # original query string; which one the server honours is not visible
        # from this code - confirm before removing either.
        suffix = '&pois=0&radius=1&output=json&pois=1&ak=' + urp.quote(
            self._ak, safe=''
        )
        frame = self.__data__
        urls = [
            prefix
            + self.__to_string__(lat)
            + ','
            + self.__to_string__(lng)
            + suffix
            for lat, lng in zip(frame['小区纬度'], frame['小区经度'])
        ]
        return pd.Series(urls, index=frame.index, dtype=object)

    def get_address(self):
        """Fill '小区地址' for every row and return the frame.

        Rows whose lookup fails get the string ``'None1'``.

        Raises:
            ValueError: if no API key was given to the constructor.
        """
        _require_ak(self._ak)
        url = self.__get_address2__()
        self.__data__['小区地址'] = url.apply(self.__get_address1__)
        return self.__data__
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/weixin_41968760/article/details/80677954