本文实例讲述了Python基于pandas实现json格式转换成dataframe的方法。分享给大家供大家参考,具体如下:
# -*- coding:utf-8 -*-
#!python3
import re
import json  # kept from the original source (currently unused)
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup


class image_structs():
    """A single gallery image: its DOM id and the image source URL."""

    def __init__(self):
        self.picture_url = {
            "image_id": '',
            "picture_url": '',
        }


class data_structs():
    """One catalog item scraped from a product page.

    `picture_url` collects image_structs.picture_url dicts so that
    pd.json_normalize can explode the record into one row per image.
    """

    def __init__(self):
        # columns: title, item_url, id, picture_url, std_desc,
        #          description, information, fitment
        self.info = {
            "title": '',
            "item_url": '',
            "id": 0,
            "picture_url": [],
            "std_desc": '',
            "description": '',
            "information": '',
            "fitment": '',
        }


# Search listing:
# "https://waldoch.com/store/catalogsearch/result/index/?cat=0&limit=200&p=1&q=nerf+bar"
# Example product page:
# https://waldoch.com/store/new-oem-ford-f-150-f150-5-running-boards-nerf-bar-crew-cab-2015-w-brackets-fl34-16451-ge5fm6.html
def get_item_list(outfile):
    """Crawl 6 search-result pages and save (title, item_url, id) to Excel.

    Duplicate rows are dropped; the surviving DataFrame index becomes the
    per-item `id` that get_item_info later uses for its resume check.
    """
    result = []
    for i in range(6):
        print(i)  # progress indicator
        page = str(i + 1)
        url = ("https://waldoch.com/store/catalogsearch/result/index/"
               "?cat=0&limit=200&p=" + page + "&q=nerf+bar")
        web = requests.get(url)
        soup = BeautifulSoup(web.text, "html.parser")
        for a in soup.find_all("a", class_="product-image"):
            result.append([a["title"], a["href"]])
    df = pd.DataFrame(result, columns=["title", "item_url"])
    df = df.drop_duplicates()
    df["id"] = df.index
    df.to_excel(outfile, index=False)


def _section_after_h2(soup, heading, default):
    """Return the element right after the <h2> titled `heading`, or `default`
    when the heading is absent (soup.find returns None)."""
    try:
        return soup.find("h2", text=heading).find_next()
    except AttributeError:  # heading not present on this page
        return default


def get_item_info(file, outfile):
    """Scrape each product page listed in `file` (an Excel sheet produced by
    get_item_list) and flatten it to one row per gallery image.

    Each item's DataFrame is built with pd.json_normalize and written to
    "<id>.xlsx"; items whose workbook already exists are skipped, making the
    crawl resumable. Finally the item list itself is re-saved to `outfile`.
    """
    DEFAULT_FALSE = ""
    df = pd.read_excel(file)
    for i in df.index:
        item_id = df.loc[i, "id"]  # renamed: original shadowed builtin `id`
        # Resumability: skip items already exported on a previous run.
        if os.path.exists(str(int(item_id)) + ".xlsx"):
            continue
        item_url = df.loc[i, "item_url"]
        web = requests.get(item_url)
        soup = BeautifulSoup(web.text, "html.parser")

        data = data_structs()
        data.info["title"] = df.loc[i, "title"]
        data.info["id"] = item_id
        data.info["item_url"] = item_url

        # Gallery images: <img class="gallery-image..."> tags.
        for a in soup.find_all("img", class_=re.compile("^gallery-image")):
            image = image_structs()
            image.picture_url["image_id"] = a["id"]
            image.picture_url["picture_url"] = a["src"]
            print(image.picture_url)
            data.info["picture_url"].append(image.picture_url)
        print(data.info)

        # Standard description: all text inside the itemprop="description" div.
        std_desc = soup.find("div", itemprop="description")
        try:
            strings_desc = "\n".join(std_desc.stripped_strings)
        except AttributeError:  # section missing -> soup.find returned None
            strings_desc = DEFAULT_FALSE

        # Optional sections: the element following each titled <h2> heading.
        description = _section_after_h2(soup, "Description", DEFAULT_FALSE)
        information = _section_after_h2(soup, "Information", DEFAULT_FALSE)
        fitment = _section_after_h2(soup, "Fitment", DEFAULT_FALSE)

        data.info["std_desc"] = strings_desc
        data.info["description"] = str(description)
        data.info["information"] = str(information)
        data.info["fitment"] = str(fitment)
        print(data.info.keys())

        # Flatten: one row per picture_url entry, item fields repeated.
        # pd.json_normalize replaces the removed pandas.io.json import.
        singledf = pd.json_normalize(
            data.info,
            "picture_url",
            ['title', 'item_url', 'id', 'std_desc',
             'description', 'information', 'fitment'],
        )
        # FIX: the original wrote every item to "test.xlsx" and called exit()
        # after the first one (debug leftover), so the loop never finished and
        # the resume check above could never match. Write the per-item
        # workbook the resume check actually looks for.
        singledf.to_excel(str(int(item_id)) + ".xlsx", index=False)
    df.to_excel(outfile, index=False)


if __name__ == "__main__":
    # get_item_list("item_urls.xlsx")
    get_item_info("item_urls.xlsx", "item_urls_info.xlsx")
这里涉及到的几个Python模块都可以使用pip install命令进行安装,如:
pip install BeautifulSoup4
pip install xlrd
pip install openpyxl
希望本文所述对大家Python程序设计有所帮助。
原文链接:https://blog.csdn.net/zn505119020/article/details/78964111