图片版pdf无法复制,转化成文字版的pdf后使用更方便.
我们需要用到 Python 3.6、PyPDF2、Ghostscript、PythonMagick、百度文字识别服务和 pdfkit(pdfkit 依赖 wkhtmltopdf)。
安装
安装python3.6 略
安装ghostscript
https://ghostscript.com/download/gsdnld.html
安装wkhtmltopdf
https://wkhtmltopdf.org/downloads.html
pip安装pypdf2,ghostscript,baidu-aip,pdfkit
1
2
3
4
|
# Install the Python packages used in this article, one command per line.
pip install pypdf2
pip install ghostscript
pip install baidu-aip
pip install pdfkit
pip安装pythonmagick
https://www.lfd.uci.edu/~gohlke/pythonlibs/
1
2
|
# Install PythonMagick from the wheel downloaded from the page linked above.
# Run this from the directory the wheel was saved to.
cd 下载目录
pip install PythonMagick-0.9.13-cp36-cp36m-win_amd64.whl
pypdf2用于拆分和合并pdf
示例代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
# Example: copy the fourth page of a PDF into a new PDF with PyPDF2.
# NOTE: this uses the PyPDF2 1.x/2.x API (PdfFileReader / PdfFileWriter,
# getNumPages / getPage / addPage); PyPDF2 3.0 renamed these.
from PyPDF2 import PdfFileReader, PdfFileWriter

# The source file stays open until the output has been written, because the
# reader is handed the open file object and the page is still in use.
with open(r'pdf路径', 'rb') as source_file:
    # Get a reader for the PDF.
    pdf_input = PdfFileReader(source_file)

    # Number of pages in the PDF.
    page_count = pdf_input.getNumPages()

    # Fourth page (indices are zero-based) and its content entry.
    page = pdf_input.getPage(3)
    page['/Contents']

    # Create a writer and add the page object to it.
    pdf_output = PdfFileWriter()
    pdf_output.addPage(page)

    # Save the new PDF.
    with open(r'新pdf路径', 'wb') as target_file:
        pdf_output.write(target_file)
pythonmagick用于将单页pdf转化为jpg
百度云-文字识别-python sdk
每天有500次免费的识别
示例代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
# Example: general text recognition with the Baidu AIP OCR SDK (baidu-aip).
from aip import AipOcr

# Create an application in the Baidu Cloud console
# (https://console.bce.baidu.com/#/index/overview:
#  Products -> Artificial Intelligence -> OCR -> Create application)
# and fill in the three values it provides.
app_id = '??'
api_key = '??'
secret_key = '??'

# Create the OCR client.
client = AipOcr(app_id, api_key, secret_key)


def get_file_content(filepath):
    """Return the raw bytes of the file at *filepath*."""
    with open(filepath, 'rb') as fp:
        return fp.read()


# Optional request parameters; the same set is used for every call below.
options = {
    "language_type": "CHN_ENG",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}

# General OCR on a local image.
image = get_file_content('p1.jpg')
local_result = client.basicGeneral(image, options)

# General OCR on a remote image, addressed by URL.
url = "https://note.youdao.com/yws/public/resource/1577071c1ffa2b6bf4e238ef6dbcfbf5/xmlnote/e5a19bedfeba4879b217c5bbf53b0245/22138"
url_result = client.basicGeneralUrl(url, options)

# General OCR on a local image of a table.
# Author's note: table cells are segmented poorly by this endpoint.
image = get_file_content('p2.jpg')
table_result = client.basicGeneral(image, options)
pdfkit用于利用字符串生成pdf
示例代码如下:
1
2
3
4
5
6
7
|
# Example: build a PDF from a string with pdfkit.
import pdfkit

# Location passed to pdfkit as the wkhtmltopdf executable (placeholder value).
path_wk = r'pdfkit安装位置设置'
pdfkit_config = pdfkit.configuration(wkhtmltopdf=path_wk)

# pdfkit options.
pdfkit_options = {'encoding': 'utf-8'}

# Create the PDF. The output path is a raw string so that the "\t" in
# "d:\test.pdf" is a backslash followed by "t", not a tab character.
pdfkit.from_string(
    'string',
    r'd:\test.pdf',
    configuration=pdfkit_config,
    options=pdfkit_options,
)
完整代码如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
# Convert an image-only PDF into a text PDF:
#   1. render every page of the PDF to a JPEG (PythonMagick),
#   2. run Baidu OCR on every JPEG (baidu-aip),
#   3. write the recognised text of every page to a one-page PDF (pdfkit),
#   4. merge the one-page PDFs into newpdf.pdf (PyPDF2).
#
# NOTE: PdfFileReader / PdfFileWriter / getNumPages / getPage / addPage are
# the PyPDF2 1.x/2.x names; PyPDF2 3.0 renamed them.

# Standard library: path handling, HTML escaping, grouped file cleanup.
import html
import os
from contextlib import ExitStack

# ghostscript: imported by the original script but not referenced below.
# NOTE(review): presumably kept because PDF rendering needs Ghostscript
# installed -- confirm before removing.
import ghostscript
# PyPDF2: split and merge PDFs.
from PyPDF2 import PdfFileReader, PdfFileWriter
# PythonMagick: render a single PDF page to an image.
from PythonMagick import Image
# baidu-aip: Baidu OCR client.
from aip import AipOcr
# pdfkit: build a PDF from an HTML string.
import pdfkit

# Parameters: working directory, PDF file name inside it, render density.
path = '??'
pdfname = '??'
dpi = '85'

# Create an application in the Baidu Cloud console
# (https://console.bce.baidu.com/#/index/overview:
#  Products -> Artificial Intelligence -> OCR -> Create application)
# and fill in the three values it provides.
app_id = '??'
api_key = '??'
secret_key = '??'

# Location passed to pdfkit as the wkhtmltopdf executable (placeholder value).
path_wk = r'pdfkit安装位置设置'
pdfkit_config = pdfkit.configuration(wkhtmltopdf=path_wk)

# pdfkit options.
pdfkit_options = {'encoding': 'utf-8'}

# --- Step 1: PDF pages -> JPEG images ---------------------------------------
os.chdir(path)

# Read the page count, then close the source file again.
with open(pdfname, 'rb') as source_file:
    pdf_input = PdfFileReader(source_file)
    page_count = pdf_input.getNumPages()

# All pages by default; a sub-range can be given by hand instead,
# e.g. page_range = range(0, 100).
page_range = range(page_count)

for page_num in page_range:
    im = Image()
    # Density is set on the image before the page is read.
    im.density(dpi)
    # "file.pdf[n]" selects page n of the PDF.
    im.read('{}[{}]'.format(pdfname, page_num))
    im.write('{}.jpg'.format(page_num))

# --- Step 2: JPEG images -> text --------------------------------------------
client = AipOcr(app_id, api_key, secret_key)


def get_file_content(filepath):
    """Return the raw bytes of the file at *filepath*."""
    with open(filepath, 'rb') as fp:
        return fp.read()


# Optional request parameters.
options = {
    "language_type": "CHN_ENG",
    "detect_direction": "false",
    "detect_language": "false",
    "probability": "false",
}

# One HTML fragment per entry of page_range, in the same order.
allteststr = []
for page_num in page_range:
    image = get_file_content(os.path.join(path, '{}.jpg'.format(page_num)))

    # General OCR; the response is a dict.
    testjson = client.basicGeneral(image, options)
    if 'words_result' not in testjson:
        # Fail with the service's own message instead of a bare KeyError.
        raise RuntimeError(
            'OCR failed for page {}: {}'.format(page_num, testjson)
        )

    # One recognised line per HTML line; escape the text so that characters
    # such as "<" and "&" are shown literally rather than parsed as markup.
    teststr = ''.join(
        html.escape(x['words']) + '<br>' for x in testjson['words_result']
    )
    allteststr.append(teststr)

# --- Step 3: text -> one PDF per page ---------------------------------------
# Pair each page number with its text by position, so a hand-picked
# page_range that does not start at 0 still works.
for page_num, page_html in zip(page_range, allteststr):
    pdfkit.from_string(
        page_html,
        '{}.pdf'.format(page_num),
        configuration=pdfkit_config,
        options=pdfkit_options,
    )

# --- Step 4: merge the per-page PDFs ----------------------------------------
pdf_output = PdfFileWriter()
with ExitStack() as stack:
    for page_num in page_range:
        # Keep every per-page file open until the merged PDF is written.
        single_page_file = stack.enter_context(
            open('{}.pdf'.format(page_num), 'rb')
        )
        pdf_input = PdfFileReader(single_page_file)
        # Only the first page of each per-page PDF is taken.
        page = pdf_input.getPage(0)
        pdf_output.addPage(page)

    with open('newpdf.pdf', 'wb') as merged_file:
        pdf_output.write(merged_file)
以上就是为大家介绍的如何使用 Python 3.6、PyPDF2、Ghostscript、PythonMagick、百度文字识别服务和 pdfkit 将图片版 PDF 转化为文字版 PDF 的方法。
原文链接:https://blog.csdn.net/sqq513/article/details/79368243