在crnn训练的时候需要用到lmdb格式的数据集,下面是python生成lmdb格式数据集的代码。注意一定要在linux系统下运行,否则读入图像的时候会出问题。可能遇到的问题都在代码里面注释了,看代码即可。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
# -*- coding: utf-8 -*-
"""Build an LMDB dataset (encoded image bytes + label text) for CRNN training."""
import glob
import os

import cv2
import lmdb  # install first: pip install lmdb
import numpy as np

# Maximum size of the LMDB map, in bytes (8 GiB). The value this script was
# derived from was 1 TiB (1099511627776), which fails with an
# "insufficient disk space" error on smaller disks.
DEFAULT_MAP_SIZE = 8589934592


def checkImageIsValid(imageBin):
    """Return True if *imageBin* is an encoded image that OpenCV can decode.

    ARGS:
        imageBin : raw content of an image file (bytes), or None
    """
    # Reject None and empty buffers up front: there is nothing to decode and
    # cv2.imdecode does not accept an empty buffer.
    if not imageBin:
        return False
    # np.frombuffer replaces the deprecated binary mode of np.fromstring.
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0


def _toBytes(value):
    """Return *value* as bytes, UTF-8 encoding it when it is a text string."""
    # LMDB stores raw bytes. On Python 2 `bytes` is `str`, so byte strings
    # pass through untouched; on Python 3 text is encoded here.
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def writeCache(env, cache):
    """Write every key/value pair of *cache* to *env* in one write transaction.

    ARGS:
        env   : an open LMDB environment
        cache : dict mapping keys to values (text or bytes)
    """
    with env.begin(write=True) as txn:
        for k, v in cache.items():
            txn.put(_toBytes(k), _toBytes(v))


def createDataset(outputPath, imagePathList, labelList, lexiconList=None,
                  checkValid=True, mapSize=DEFAULT_MAP_SIZE):
    """
    Create LMDB dataset for CRNN training.

    Samples are stored under the keys 'image-%09d', 'label-%09d' and
    (optionally) 'lexicon-%09d', numbered from 1; the number of samples
    actually written is stored under 'num-samples'.

    ARGS:
        outputPath    : LMDB output path
        imagePathList : list of image path
        labelList     : list of corresponding groundtruth texts
        lexiconList   : (optional) list of lexicon lists
        checkValid    : if true, check the validity of every image
        mapSize       : maximum size of the LMDB map in bytes
    """
    assert (len(imagePathList) == len(labelList))
    nSamples = len(imagePathList)
    print('...................')
    env = lmdb.open(outputPath, map_size=mapSize)
    try:
        cache = {}
        cnt = 1
        for i, (imagePath, label) in enumerate(zip(imagePathList, labelList)):
            if not os.path.exists(imagePath):
                print('%s does not exist' % imagePath)
                continue
            # Binary mode is essential: in text mode Windows translates
            # line endings and stops at Ctrl-Z, which corrupts the image
            # bytes and makes every image fail the validity check below.
            with open(imagePath, 'rb') as f:
                imageBin = f.read()
            if checkValid and not checkImageIsValid(imageBin):
                print('%s is not a valid image' % imagePath)
                continue

            cache['image-%09d' % cnt] = imageBin
            cache['label-%09d' % cnt] = label
            if lexiconList:
                cache['lexicon-%09d' % cnt] = ' '.join(lexiconList[i])
            # Flush in batches of 1000 samples to bound memory usage.
            if cnt % 1000 == 0:
                writeCache(env, cache)
                cache = {}
                print('Written %d / %d' % (cnt, nSamples))
            cnt += 1
        # cnt is one past the last sample written; skipped images are not
        # counted.
        nSamples = cnt - 1
        cache['num-samples'] = str(nSamples)
        writeCache(env, cache)
    finally:
        # Release the environment even if reading or writing failed.
        env.close()
    print('Created dataset with %d samples' % nSamples)


def read_text(path):
    """Return the content of the file at *path* with surrounding whitespace stripped."""
    with open(path) as f:
        return f.read().strip()


def main():
    """Collect the image/label pairs of one directory and write them to LMDB."""
    # LMDB output directory. The training set and the validation set are
    # generated separately, so run this script once for each.
    outputPath = 'D:/ruanjianxiazai/tuxiangyangben/fengehou/train'
    # Every .jpg must sit in the same directory as a .txt label file with
    # the same base name.
    path = "D:/ruanjianxiazai/tuxiangyangben/fengehou/chenguang/*.jpg"
    imagePathList = glob.glob(path)
    print('------------ %d ------------' % len(imagePathList))

    imgLabelLists = []
    for p in imagePathList:
        # splitext only touches the extension; str.replace would also
        # rewrite a '.jpg' that appears in a directory name.
        labelPath = os.path.splitext(p)[0] + '.txt'
        try:
            imgLabelLists.append((p, read_text(labelPath)))
        except (IOError, OSError, UnicodeDecodeError) as err:
            # Best effort: an image without a readable label is skipped.
            print('skipping %s: %s' % (p, err))
            continue

    # Sort the samples by label length.
    imgLabelList = sorted(imgLabelLists, key=lambda x: len(x[1]))
    imgPaths = [p[0] for p in imgLabelList]
    txtLists = [p[1] for p in imgLabelList]
    createDataset(outputPath, imgPaths, txtLists, lexiconList=None,
                  checkValid=True)


if __name__ == '__main__':
    main()
以上这篇python生成lmdb格式的文件实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/dulingtingzi/article/details/79585180