需求:
现有一个 csv文件,包含'CNUM'和'COMPANY'两列,数据里包含空行,且有内容重复的行数据。
要求:
1)去掉空行;
2)重复行数据只保留一行有效数据;
3)修改'COMPANY'列的名称为'Company_New';
4)并在其后增加六列,分别为'C_col','D_col','E_col','F_col','G_col','H_col'。
一,使用 Python Pandas来处理:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
import pandas as pd
import numpy as np
from pandas import DataFrame, Series


def deal_with_data(filepath, newpath):
    """Clean a two-column CSV and write the result to a new CSV file.

    The input is expected to contain the columns 'CNUM' and 'COMPANY'.
    The function:
      * appends six empty columns 'C_col' .. 'H_col',
      * renames 'COMPANY' to 'Company_New',
      * drops rows in which every column is empty,
      * casts 'CNUM' to int32,
      * keeps only the first of any rows sharing the same
        ('CNUM', 'Company_New') pair,
      * writes the result to ``newpath`` encoded as GBK, without the index.

    Args:
        filepath: Path of the CSV file to read.
        newpath: Path of the CSV file to write.

    Raises:
        ValueError / pandas errors: if a remaining row has a missing or
            non-integer 'CNUM' (the int32 cast fails).
    """
    # The context manager guarantees the handle is closed even when one of
    # the pandas calls below raises.
    with open(filepath) as file_obj:
        df = pd.read_csv(file_obj)  # read the CSV into a DataFrame

    # Re-specify the column index; columns that do not exist yet are created empty.
    df = df.reindex(
        columns=['CNUM', 'COMPANY', 'C_col', 'D_col', 'E_col', 'F_col', 'G_col', 'H_col'],
        fill_value=None,
    )
    df = df.rename(columns={'COMPANY': 'Company_New'})  # rename the column
    # Drop rows that are entirely NaN, i.e. the blank lines of the file.
    df = df.dropna(axis=0, how='all')
    # Blank rows force pandas to read CNUM as float; restore an integer dtype.
    df['CNUM'] = df['CNUM'].astype('int32')
    # Remove duplicate rows, keeping the first occurrence.
    df = df.drop_duplicates(subset=['CNUM', 'Company_New'], keep='first')
    df.to_csv(newpath, index=False, encoding='GBK')


if __name__ == '__main__':
    file_path = r'C:\Users\P78\Desktop\python\CNUM_COMPANY.csv'
    file_save_path = r'C:\Users\P78\Desktop\python\CNUM_COMPANY_OUTPUT.csv'
    deal_with_data(file_path, file_save_path)
二,使用 VBA来处理:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
Option Base 1
Option Explicit

' Entry point.
' Opens the source CSV, collects every non-blank, non-duplicate
' (CNUM, COMPANY) row, renames the 'COMPANY' header to 'Company_New',
' writes the rows to a new CSV workbook and adds the six header cells
' C_col .. H_col in C1:H1. The source workbook is closed without saving,
' the output workbook is saved on success and discarded on error.
Sub main()

    On Error GoTo error_handling

    Dim wb As Workbook
    Dim wb_out As Workbook
    Dim sht As Worksheet
    Dim sht_out As Worksheet
    Dim rng As Range
    ' Long instead of Byte: a Byte overflows as soon as the sheet has more than 255 rows.
    Dim usedrows As Long
    Dim dict_cnum_company As Object
    Dim str_file_path As String
    Dim str_new_file_path As String

    'assign values to variables:
    str_file_path = "C:\Users\P78\Desktop\Python\CNUM_COMPANY.csv"
    str_new_file_path = "C:\Users\P78\Desktop\Python\CNUM_COMPANY_OUTPUT.csv"

    Set wb = checkAndAttachWorkbook(str_file_path)
    Set sht = wb.Worksheets("CNUM_COMPANY")

    Set wb_out = Workbooks.Add
    wb_out.SaveAs str_new_file_path, xlCSV 'create a csv file
    Set sht_out = wb_out.Worksheets("CNUM_COMPANY_OUTPUT")

    Set dict_cnum_company = CreateObject("Scripting.Dictionary")
    usedrows = WorksheetFunction.Max(getLastValidRow(sht, "A"), getLastValidRow(sht, "B"))

    'rename the header 'COMPANY' to 'Company_New', remove blank & duplicate lines / rows.
    Dim cnum_value As Variant
    Dim company_value As Variant
    Dim row_key As String
    Dim header_renamed As Boolean
    header_renamed = False

    For Each rng In sht.Range("A1", "A" & usedrows)
        cnum_value = rng.Value
        company_value = rng.Offset(0, 1).Value

        ' Only the first 'COMPANY' cell is the header; later data rows that
        ' happen to contain the same text are left untouched.
        If Not header_renamed Then
            If VBA.Trim(CStr(company_value)) = "COMPANY" Then
                company_value = "Company_New"
                header_renamed = True
            End If
        End If

        ' A row is blank when both cells are empty after trimming.
        If VBA.Trim(CStr(cnum_value)) <> "" Or VBA.Trim(CStr(company_value)) <> "" Then
            ' Chr$(31) (unit separator) cannot clash with cell text the way "-" can.
            row_key = CStr(cnum_value) & Chr$(31) & CStr(company_value)
            If Not dict_cnum_company.Exists(row_key) Then
                ' Keep the original cell values as the item so nothing has to be
                ' re-parsed from the key. VBA.Array is always 0-based, regardless
                ' of Option Base.
                dict_cnum_company.Add row_key, VBA.Array(cnum_value, company_value)
            End If
        End If
    Next rng

    'copy the kept rows into one 2-D array and assign it to the cells in a single write.
    Dim row_count As Long
    Dim row_index As Long
    Dim kept_rows As Variant
    Dim out_data() As Variant

    row_count = dict_cnum_company.Count
    If row_count > 0 Then
        kept_rows = dict_cnum_company.Items ' 0-based array of the stored pairs
        ReDim out_data(1 To row_count, 1 To 2)
        For row_index = 0 To row_count - 1
            out_data(row_index + 1, 1) = kept_rows(row_index)(0)
            out_data(row_index + 1, 2) = kept_rows(row_index)(1)
        Next row_index
        sht_out.Range("A1").Resize(row_count, 2).Value = out_data
    End If

    'add 6 columns to output csv file :
    Dim arr_columns() As Variant
    arr_columns = Array("C_col", "D_col", "E_col", "F_col", "G_col", "H_col")
    sht_out.Range("C1:H1").Value = arr_columns

    Call checkAndCloseWorkbook(str_file_path, False)
    Call checkAndCloseWorkbook(str_new_file_path, True)

    Exit Sub

error_handling:
    ' Report the failure instead of swallowing it, then release both workbooks.
    Debug.Print "main failed: " & Err.Number & " - " & Err.Description
    Call checkAndCloseWorkbook(str_file_path, False)
    Call checkAndCloseWorkbook(str_new_file_path, False)

End Sub

' Helper functions:

'Get last row of Column N in a Worksheet
' in_ws  : worksheet to inspect
' in_col : column letter, e.g. "A"
' Returns the row number of the last non-empty cell found by End(xlUp)
' starting from the bottom of the column.
Function getLastValidRow(in_ws As Worksheet, in_col As String) As Long
    getLastValidRow = in_ws.Cells(in_ws.Rows.Count, in_col).End(xlUp).Row
End Function

' Returns the open workbook whose full path equals in_wb_path
' (case-insensitive); opens the file when it is not open yet.
Function checkAndAttachWorkbook(in_wb_path As String) As Workbook
    Dim wb As Workbook
    Dim mywb As String
    mywb = in_wb_path

    For Each wb In Workbooks
        If LCase(wb.FullName) = LCase(mywb) Then
            Set checkAndAttachWorkbook = wb
            Exit Function
        End If
    Next

    Set wb = Workbooks.Open(in_wb_path, UpdateLinks:=0)
    Set checkAndAttachWorkbook = wb

End Function

' Closes the open workbook whose full path equals in_wb_path
' (case-insensitive), saving it when in_saved is True.
' Does nothing when no such workbook is open.
Function checkAndCloseWorkbook(in_wb_path As String, in_saved As Boolean)
    Dim wb As Workbook
    Dim mywb As String
    mywb = in_wb_path

    For Each wb In Workbooks
        If LCase(wb.FullName) = LCase(mywb) Then
            wb.Close SaveChanges:=in_saved
            Exit Function
        End If
    Next
End Function
三,输出结果:
两种方法输出结果相同:
四,比较总结:
Python pandas 内置了大量处理数据的方法,我们不需要重复造轮子,用起来很方便,代码简洁的多。
Excel VBA 处理这个需求,使用了 数组,字典等数据结构(实际需求中,数据量往往很大,所以一些地方没有直接使用遍历单元格的方法),以及处理字符串,数组和字典的很多方法,对文件的操作也很复杂,一旦出错,调试起来比python也较困难,代码已经尽量优化,但还是远比 Python要多。
到此这篇关于VBA处理数据与Python Pandas处理数据案例比较分析的文章就介绍到这了,更多相关VBA与Python Pandas处理数据内容请搜索服务器之家以前的文章或继续浏览下面的相关文章希望大家以后多多支持服务器之家!
原文链接:https://blog.csdn.net/qq_24937551/article/details/105338086