本文实例讲述了Python实现批量将word转html并将html内容发布至网站的方法。分享给大家供大家参考。具体实现方法如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
#coding=utf-8
"""Batch-convert Word documents to HTML and publish the HTML to a website.

``wordsToHtml`` drives a word processor through COM to save every
``.doc``/``.docx`` file under a directory as HTML.  ``html_add_to_db``
copies each generated HTML file (and the local images it references) into
the web server's static directory and inserts an ``<iframe>`` wrapper for
it into the ``common_article`` table.

The script targets Python 2 on Windows; syntax is kept valid for Python 3
as well.
"""
__author__ = 'zhm'

from win32com import client as wc
import os
import time
import random
import MySQLdb
import re

try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3

# COM ProgID of the word processor.  Kingsoft WPS: the preview edition
# registers as "KWPS", the official release as "WPS".
WORD_PROG_ID = 'KWPS.Application'
# File-format code passed to SaveAs; 8 is wdFormatHTML in Microsoft Word's
# WdSaveFormat enumeration (assumed to be honoured by WPS as well).
WD_FORMAT_HTML = 8
# Encoding used to decode byte-string file names and image paths.
FS_ENCODING = "gbk"
# D:/files is the static directory configured for the web server.
HTML_TARGET_DIR = 'D:/files/htmls/'
IMAGE_TARGET_DIR = 'D:/files/images/whzx/'
WORD_SUFFIXES = ("doc", "docx")
HTML_SUFFIXES = ("htm", "html")
NO_SUFFIX_MESSAGE = '********************ERROR: 未取得后缀名!'
# Captures the src value of every <img> tag.  Compiled once as a bytes
# pattern because the HTML is read in binary mode.
IMG_SRC_RE = re.compile(
    br"""<img\s.*?\s?src\s*=\s*['|"]?([^\s'"]+).*?>""", re.I)


def _to_unicode(value, encoding=FS_ENCODING):
    """Return *value* as text, decoding byte strings with *encoding*."""
    if isinstance(value, _text_type):
        return value
    return value.decode(encoding)


def _ensure_dir(directory):
    """Create *directory* (and parents) if it does not exist yet."""
    if not os.path.exists(directory):
        os.makedirs(directory)


def _unique_target(directory, suffix):
    """Return a not-yet-existing path in *directory* ending with *suffix*.

    The file name is the current time in milliseconds followed by a random
    number, regenerated until it does not collide with an existing file.
    """
    while True:
        name = (str(int(time.time() * 1000))
                + str(random.randint(100, 10000)) + suffix)
        candidate = os.path.join(directory, name)
        if not os.path.exists(candidate):
            return candidate


def _web_path(local_path):
    """Strip the drive prefix: 'D:/files/x.html' -> '/files/x.html'."""
    return local_path.split(":", 1)[-1]


def _iframe_html(src):
    """Return the script + <iframe> snippet that embeds the page at *src*."""
    return '''
<script type="text/javascript" language="javascript">
function iFrameHeight() {
    var ifm= document.getElementById("iframepage");
    var subWeb = document.frames ? document.frames["iframepage"].document:ifm.contentDocument;
    if(ifm != null && subWeb != null) {
        ifm.height = subWeb.body.scrollHeight;
    }
}
</script>
<iframe src="''' + src + '''" marginheight="0" marginwidth="0" frameborder="0" scrolling="no" width="765" height="100%" id="iframepage" name="iframepage" onLoad="iFrameHeight()"></iframe>
'''


def _publish_images(path, content):
    """Copy the local images referenced by *content* to the image directory.

    *path* is the directory of the HTML file and *content* its raw bytes.
    Every ``src`` that resolves to an existing file relative to *path* is
    copied under a fresh name and rewritten to the copy's web path.  Sources
    that cannot be decoded or do not exist locally (e.g. remote URLs) are
    left untouched.  Returns the rewritten bytes.
    """
    seen = set()
    for src in IMG_SRC_RE.findall(content):
        if src in seen:
            # The first replace() already rewrote every occurrence.
            continue
        seen.add(src)
        try:
            source_path = os.path.join(path, _to_unicode(src))
        except UnicodeDecodeError:
            continue
        if not os.path.isfile(source_path):
            continue
        _ensure_dir(IMAGE_TARGET_DIR)
        target = _unique_target(IMAGE_TARGET_DIR, '.png')
        with open(source_path, 'rb') as image_in:
            data = image_in.read()
        with open(target, 'wb') as image_out:
            image_out.write(data)
        # The web path is pure ASCII; encode so it can be spliced into bytes.
        content = content.replace(src, _web_path(target).encode("ascii"))
    return content


def _publish_html_file(cur, path, name):
    """Publish one HTML file and insert its iframe wrapper via cursor *cur*.

    Files whose suffix is not htm/html are skipped; a file without any
    suffix is reported and skipped.
    """
    title, dot, suffix = name.rpartition(".")
    if not dot:
        print(NO_SUFFIX_MESSAGE)
        return
    if suffix.lower() not in HTML_SUFFIXES:
        return

    _ensure_dir(HTML_TARGET_DIR)
    # Read the page content.
    with open(os.path.join(path, name), 'rb') as html_in:
        content = html_in.read()
    # Find the images inside it and upload them.
    content = _publish_images(path, content)

    target = _unique_target(HTML_TARGET_DIR, '.html')
    with open(target, 'wb') as html_out:
        html_out.write(content)

    # Wrap the converted html file in an iframe.
    iframe = _iframe_html(_web_path(target))
    try:
        sql = "insert into common_article(title,content) values(%s,%s)"
        cur.execute(sql, (title, iframe))
    except MySQLdb.Error as exc:
        # Best effort: report the failure and carry on with the next file.
        print("Error: unable to insert data")
        print(exc)


def wordsToHtml(dir):
    """Convert every .doc/.docx file under *dir* to an HTML file beside it.

    The directory tree is walked recursively.  Each Word document is opened
    through the COM application named by ``WORD_PROG_ID`` and saved as
    ``<name>.html`` in the same folder.  Files with other suffixes and Word
    lock files (``~$...``) are skipped without being opened; a file without
    any suffix is reported and skipped.  Every opened document is closed and
    the application is quit even if a conversion raises.
    """
    # COM resolves relative paths against its own working directory, so
    # hand it absolute paths.
    root = os.path.abspath(_to_unicode(dir))
    word = wc.Dispatch(WORD_PROG_ID)
    try:
        for path, subdirs, files in os.walk(root):
            path = _to_unicode(path)
            for wordFile in files:
                name = _to_unicode(wordFile)
                fileName, dot, fileSuffix = name.rpartition(".")
                if not dot:
                    print(NO_SUFFIX_MESSAGE)
                    continue
                if (fileSuffix.lower() not in WORD_SUFFIXES
                        or name.startswith("~$")):
                    continue
                doc = word.Documents.Open(os.path.join(path, name))
                try:
                    htmlFullName = os.path.join(path, fileName + ".html")
                    print(u'生成了html文件:' + htmlFullName)
                    doc.SaveAs(htmlFullName, WD_FORMAT_HTML)
                finally:
                    doc.Close()
    finally:
        word.Quit()
    print("")
    print("Finished!")


def html_add_to_db(dir):
    """Publish every .htm/.html file under *dir* and record it in MySQL.

    For each HTML file the referenced local images are copied to
    ``IMAGE_TARGET_DIR``, the rewritten page is written to
    ``HTML_TARGET_DIR``, and a row (title, iframe snippet) is inserted into
    ``common_article``.  All inserts are committed once the walk finishes;
    the cursor and connection are closed even if an error occurs.

    NOTE(review): connection settings are hard-coded, as in the original.
    """
    root = _to_unicode(dir)
    conn = MySQLdb.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='test',
        charset='utf8',
    )
    try:
        cur = conn.cursor()
        try:
            for path, subdirs, files in os.walk(root):
                for htmlFile in files:
                    _publish_html_file(
                        cur, _to_unicode(path), _to_unicode(htmlFile))
            conn.commit()
        finally:
            cur.close()
    finally:
        # Close the database connection.
        conn.close()


if __name__ == '__main__':
    wordsToHtml('d:/word')
    html_add_to_db('d:/word')
希望本文所述对大家的Python程序设计有所帮助。