问题
问题是这样的,要把一个数组存到tfrecord中,然后读取
1
2
3
4
5
|
# The rows have different lengths, so NumPy has to be told to build a 1-D
# object array of Python lists (NumPy >= 1.24 raises ValueError otherwise).
a = np.array(
    [
        [0, 54, 91, 153, 177, 1],
        [0, 50, 89, 147, 196],
        [0, 38, 79, 157],
        [0, 49, 89, 147, 177],
        [0, 32, 73, 145],
    ],
    dtype=object,
)
图片我都存储了,这个不还是小意思,一顿操作
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
import numpy as np
import tensorflow as tf


def _int64_feature(value):
    """Wrap an int, or a list of ints, in an Int64List ``tf.train.Feature``."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


# Write an array to a TFRecord file.
# `a` contains lists of varying length; dtype=object keeps every row a plain
# Python list (NumPy >= 1.24 refuses to build a ragged array without it).
a = np.array(
    [
        [0, 54, 91, 153, 177, 1],
        [0, 50, 89, 147, 196],
        [0, 38, 79, 157],
        [0, 49, 89, 147, 177],
        [0, 32, 73, 145],
    ],
    dtype=object,
)

# The `with` block closes the writer even if serialisation fails.
with tf.python_io.TFRecordWriter('file') as writer:
    for i, row in enumerate(a):
        feature = {'i': _int64_feature(i), 'data': _int64_feature(row)}
        # Create an Example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        # Serialize to a string and write it to the file.
        writer.write(example.SerializeToString())

# Use the Dataset API to read the TFRecord file back.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    """Parse one serialized Example into ``(i, data)``.

    NOTE: 'data' is declared with shape [] (a scalar) although each record
    stores 4-6 values, so this version fails at run time with
    "Number of int64 values != expected".
    """
    keys_to_features = {
        'i': tf.FixedLenFeature([], tf.int64),
        'data': tf.FixedLenFeature([], tf.int64),
    }
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()

with tf.Session() as sess:
    for _ in range(3):
        print(sess.run([i, data]))
报了奇怪的错误:Name: <unknown>, Key: data, Index: 0. Number of int64 values != expected. Values size: 6 but output shape: []。这意思是我存入的数据长度为6,但是解析时声明的输出shape是[](也就是只期望一个值),两者对不上。这到底是哪里错了?我先把读取的代码注释掉,看看tfrecord有没有写成功,发现写成功了,这就表明是读取的问题。我怀疑是因为每次写入的长度是变化的原因,但是又觉得不是,因为图片的尺寸都是不同的,我还是可以读取的。百思不得其解的时候我发现存储图片的时候用的是img.tobytes(),也就是把一个数组转换成了bytes,而且用的也是bytes存储。是不是tensorflow会把这个bytes当成一个元素?虽然每个图片的size不同,但是tobytes后tensorflow都会当成一个元素,然后读取的时候再根据(height,width,channel)来解析成图片。
我来试试不存为int64,而是存为bytes。 又是一顿厉害的操作
数据转为bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
|
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf


def _byte_feature(value):
    """Wrap a single bytes value in a BytesList ``tf.train.Feature``."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(value):
    """Wrap an int, or a list of ints, in an Int64List ``tf.train.Feature``."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


# Write an array to a TFRecord file.
# `a` contains lists of varying length; dtype=object keeps every row a plain
# Python list (NumPy >= 1.24 refuses to build a ragged array without it).
a = np.array(
    [
        [0, 54, 91, 153, 177, 1],
        [0, 50, 89, 147, 196],
        [0, 38, 79, 157],
        [0, 49, 89, 147, 177],
        [0, 32, 73, 145],
    ],
    dtype=object,
)

# The `with` block closes the writer even if serialisation fails.
with tf.python_io.TFRecordWriter('file') as writer:
    for row in a:
        feature = {
            # Store the row length instead of the meaningless index so the
            # reader can restore the row later.
            'len': _int64_feature(len(row)),
            # Each row is a plain list, so turn it into an ndarray before
            # calling tobytes(). The explicit int64 dtype fixes the byte
            # layout; NumPy's default integer width is not the same on every
            # platform and NumPy version, and the reader decodes as int64.
            'data': _byte_feature(np.array(row, dtype=np.int64).tobytes()),
        }
        # Create an Example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        # Serialize to a string and write it to the file.
        writer.write(example.SerializeToString())

# Use the Dataset API to read the TFRecord file back.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    """Parse one serialized Example into ``(len, raw_bytes)``."""
    keys_to_features = {
        'len': tf.FixedLenFeature([], tf.int64),
        'data': tf.FixedLenFeature([], tf.string),  # now a string feature
    }
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['len'], parsed_features['data']


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()

with tf.Session() as sess:
    for _ in range(3):
        print(sess.run([i, data]))

# Output (byte strings truncated; every value takes 8 bytes, shown here for a
# little-endian machine):
# [array([6], dtype=int64), array([b'\x00\x00\x00\x00\x00\x00\x00\x006\x00\x00\x00\x00\x00\x00\x00[\x00...'], dtype=object)]
# [array([5], dtype=int64), array([b'\x00\x00\x00\x00\x00\x00\x00\x002\x00\x00\x00\x00\x00\x00\x00Y\x00...'], dtype=object)]
# [array([4], dtype=int64), array([b'\x00\x00\x00\x00\x00\x00\x00\x00&\x00\x00\x00\x00\x00\x00\x00O\x00...'], dtype=object)]
bytes数据解码
如愿的输出来了,但是这个bytes我该如何解码呢
方法一,我们自己解析
1
2
|
a, b = sess.run([i, data])
# The dtype passed here must be the one the array had when tobytes() was
# called at write time. The removed alias `np.int` (gone since NumPy 1.24) is
# replaced by an explicit 64-bit integer type.
c = np.frombuffer(b[0], dtype=np.int64, count=a[0])
方法二使用tensorflow的解析函数
1
2
3
4
5
6
7
8
9
10
11
|
def _parse_function(example_proto):
    """Parse one serialized Example and decode its raw bytes into int64 values."""
    keys_to_features = {
        'len': tf.FixedLenFeature([], tf.int64),
        'data': tf.FixedLenFeature([], tf.string),  # a string feature
    }
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    # decode_raw is the parsing function used here: the values were stored in
    # int64 format, so the bytes are decoded as int64 as well.
    dat = tf.decode_raw(parsed_features['data'], tf.int64)
    return parsed_features['len'], dat


# Output:
# [array([6]), array([[ 0, 54, 91, 153, 177, 1]])]
# [array([5]), array([[ 0, 50, 89, 147, 196]])]
# [array([4]), array([[ 0, 38, 79, 157]])]
可以看到是二维数组,这是因为我们使用的是batch输出,虽然我们的batch_size=1,但是还是会以二维list的格式输出。我手贱再来修改点东西,
1
2
3
4
5
6
7
8
9
10
11
12
|
def _parse_function(example_proto):
    """Parse one serialized Example, declaring both features with shape [1]."""
    keys_to_features = {
        'len': tf.FixedLenFeature([1], tf.int64),
        'data': tf.FixedLenFeature([1], tf.string),
    }
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    dat = tf.decode_raw(parsed_features['data'], tf.int64)
    return parsed_features['len'], dat


# Output (each component has one more dimension than with shape []):
# [array([[6]]), array([[[ 0, 54, 91, 153, 177, 1]]])]
# [array([[5]]), array([[[ 0, 50, 89, 147, 196]]])]
# [array([[4]]), array([[[ 0, 38, 79, 157]]])]
呦呵,又变成3维的了,让他报个错试试
1
2
3
4
5
6
7
8
9
10
11
|
def _parse_function(example_proto):
    """Parse one serialized Example with a deliberately wrong shape for 'len'.

    'len' is declared with shape [2] while each record stores a single value,
    so parsing fails with the InvalidArgumentError shown below.
    """
    keys_to_features = {
        'len': tf.FixedLenFeature([2], tf.int64),  # changed from 1 to 2
        'data': tf.FixedLenFeature([1], tf.string),  # a string feature
    }
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['len'], parsed_features['data']


# Error raised at run time:
# InvalidArgumentError: Key: len. Can't parse serialized Example.
#   [[Node: ParseSingleExample/ParseSingleExample = ParseSingleExample[Tdense=[DT_STRING, DT_INT64], dense_keys=["data", "len"], dense_shapes=[[1], [2]], num_sparse=0, sparse_keys=[], sparse_types=[]](arg0, ParseSingleExample/Const, ParseSingleExample/Const_1)]]
#   [[Node: IteratorGetNext_22 = IteratorGetNext[output_shapes=[[?,2], [?,1]], output_types=[DT_INT64, DT_STRING], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator_22)]]
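可以看到dense_keys=["data", "len"], dense_shapes=[[1], [2]],tf.FixedLenFeature是读取固定长度的数据,我猜测[]的意思就是读取全部数据,[1]就是读取一个数据,每个数据可能包含多个数据,形如[[1,2],[3,3,4],[2]....],哈哈这都是我瞎猜的,做我女朋友好不好。(补充:这个猜测并不准确。shape参数指定的是解析后张量的形状:[]表示标量,即该feature必须恰好只有一个值;[1]表示长度为1的一维张量,同样只有一个值,只是多了一维;[2]则要求恰好有两个值,而len只存了一个值,所以上面才会报错。前面int64的data存了6个值却声明为[],报错也是同样的原因。)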
tensorflow 变长数组存储
反正是可以读取了。但是如果是自己定义的变长数组,每次都要自己解析,这样很麻烦(我瞎编的),所以tensorflow就定义了变长数组的解析方法tf.VarLenFeature,我们就不需要把变长数组变为bytes再解析了,又是一顿操作
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
|
import numpy as np
import tensorflow as tf


def _int64_feature(value):
    """Wrap an int, or a list of ints, in an Int64List ``tf.train.Feature``."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


# Write an array to a TFRecord file.
# `a` contains lists of varying length; dtype=object keeps every row a plain
# Python list (NumPy >= 1.24 refuses to build a ragged array without it).
a = np.array(
    [
        [0, 54, 91, 153, 177, 1],
        [0, 50, 89, 147, 196],
        [0, 38, 79, 157],
        [0, 49, 89, 147, 177],
        [0, 32, 73, 145],
    ],
    dtype=object,
)

# The `with` block closes the writer even if serialisation fails.
with tf.python_io.TFRecordWriter('file') as writer:
    for i, row in enumerate(a):  # i = 0 ~ 4
        feature = {'i': _int64_feature(i), 'data': _int64_feature(row)}
        # Create an Example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        # Serialize to a string and write it to the file.
        writer.write(example.SerializeToString())

# Use the Dataset API to read the TFRecord file back.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    """Parse one serialized Example into ``(i, data)``.

    'data' is read as a VarLenFeature (a sparse tensor) and converted to a
    dense tensor before it is returned.
    """
    keys_to_features = {
        'i': tf.FixedLenFeature([], tf.int64),
        'data': tf.VarLenFeature(tf.int64),
    }
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], tf.sparse_tensor_to_dense(parsed_features['data'])


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()

with tf.Session() as sess:
    for _ in range(3):
        print(sess.run([i, data]))

# Output:
# [array([0], dtype=int64), array([[ 0, 54, 91, 153, 177, 1]], dtype=int64)]
# [array([1], dtype=int64), array([[ 0, 50, 89, 147, 196]], dtype=int64)]
# [array([2], dtype=int64), array([[ 0, 38, 79, 157]], dtype=int64)]
batch输出
输出还是数组,哈哈哈。再来一波操作
1
2
3
4
|
dataset = dataset.batch(2)

# Error raised at run time:
# Cannot batch tensors with different shapes in component 1. First element had
# shape [6] and element 1 had shape [5].
这是因为一个batch中数据的shape必须是一致的,第一个元素长度为6,第二个元素长度为5,就会报错。办法就是补成一样的长度,在这之前先测试点别的
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
# Ragged rows: dtype=object is required (NumPy >= 1.24 raises ValueError
# without it) and makes `a` a 1-D array whose elements are Python lists.
a = np.array(
    [
        [0, 54, 91, 153, 177, 1],
        [0, 50, 89, 147, 196],
        [0, 38, 79, 157],
        [0, 49, 89, 147, 177],
        [0, 32, 73, 145],
    ],
    dtype=object,
)
for row in a:
    print(type(row))

# Output:
# <class 'list'>
# <class 'list'>
# <class 'list'>
# <class 'list'>
# <class 'list'>
可以发现长度不一的array每一个数据是list(一开始我以为是object)。然后补齐
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
# Every row is padded with zeros to length 6, so `a` is a regular 2-D array
# and indexing it yields ndarray rows.
a = np.array([
    [0, 54, 91, 153, 177, 1],
    [0, 50, 89, 147, 196, 0],
    [0, 38, 79, 157, 0, 0],
    [0, 49, 89, 147, 177, 0],
    [0, 32, 73, 145, 0, 0],
])
for row in a:
    print(type(row))

# Output:
# <class 'numpy.ndarray'>
# <class 'numpy.ndarray'>
# <class 'numpy.ndarray'>
# <class 'numpy.ndarray'>
# <class 'numpy.ndarray'>
返回的是numpy。为什么要做这件事呢?
1
2
3
4
|
def _int64_feature(value):
    """Wrap an int, or a list of ints, in an Int64List ``tf.train.Feature``.

    Anything that is not a ``list`` is wrapped in a one-element list first.
    That includes a ``numpy.ndarray``, which becomes ``[ndarray]``; convert
    arrays with ``.tolist()`` before passing them in.
    """
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
tensorflow要求我们输入的是list或者直接是numpy.ndarray,如果是list中包含numpy.ndarray,即[numpy.ndarray],就会报错。上面的那个数组是变长的,返回的是list,没有什么错误,我们补齐看看
1
2
3
4
5
6
7
8
9
|
a = np.array([
    [0, 54, 91, 153, 177, 1],
    [0, 50, 89, 147, 196, 0],
    [0, 38, 79, 157, 0, 0],
    [0, 49, 89, 147, 177, 0],
    [0, 32, 73, 145, 0, 0],
])

# Writing the rows of this padded array with _int64_feature(a[i]) raises:
# TypeError: only size-1 arrays can be converted to Python scalars
这就是因为返回的不是list,而是numpy.ndarray,而_int64_feature函数中先判断出numpy.ndarray不是list,所以把它转成了[numpy.ndarray],于是就报错了。可以做些修改,一种方法是将numpy.ndarray转为list
1
2
3
|
for i, row in enumerate(a):  # i = 0 ~ 4
    feature = {
        'i': _int64_feature(i),
        # Convert the ndarray row to a plain list so that _int64_feature does
        # not wrap it in another list.
        'data': _int64_feature(row.tolist()),
    }
这样补齐了我们就可以修改batch的值了
1
2
3
4
5
6
7
8
9
10
|
dataset = dataset.batch(2)

# Output:
# [array([0, 2], dtype=int64), array([[ 0, 54, 91, 153, 177, 1], [ 0, 38, 79, 157, 0, 0]], dtype=int64)]
# [array([1, 3], dtype=int64), array([[ 0, 50, 89, 147, 196, 0], [ 0, 49, 89, 147, 177, 0]], dtype=int64)]
# [array([4, 0], dtype=int64), array([[ 0, 32, 73, 145, 0, 0], [ 0, 54, 91, 153, 177, 1]], dtype=int64)]
当然tensorflow不会让我自己补齐,已经提供了补齐函数padded_batch,
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
# -*- coding: utf-8 -*-
import tensorflow as tf


def _int64_feature(value):
    """Wrap an int, or a list of ints, in an Int64List ``tf.train.Feature``."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


# A plain Python list of rows with different lengths; no NumPy needed.
a = [
    [0, 54, 91, 153, 177, 1],
    [0, 50, 89, 147, 196],
    [0, 38, 79, 157],
    [0, 49, 89, 147, 177],
    [0, 32, 73, 145],
]

# The `with` block closes the writer even if serialisation fails.
with tf.python_io.TFRecordWriter('file') as writer:
    for v in a:
        feature = {'data': _int64_feature(v)}
        # Create an Example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        # Serialize to a string and write it to the file.
        writer.write(example.SerializeToString())

# Use the Dataset API to read the TFRecord file back.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    """Parse one serialized Example and return 'data' as a dense tensor."""
    keys_to_features = {'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return tf.sparse_tensor_to_dense(parsed_features['data'])


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
# The None dimension is left for padded_batch to size per batch; the output
# below shows the shorter row of each pair padded with zeros.
dataset = dataset.padded_batch(2, padded_shapes=[None])
iterator = dataset.make_one_shot_iterator()
data = iterator.get_next()

with tf.Session() as sess:
    for _ in range(3):
        print(sess.run([data]))

# Output:
# [array([[ 0, 54, 91, 153, 177, 1], [ 0, 50, 89, 147, 196, 0]])]
# [array([[ 0, 38, 79, 157, 0], [ 0, 49, 89, 147, 177]])]
# [array([[ 0, 32, 73, 145, 0, 0], [ 0, 54, 91, 153, 177, 1]])]
可以看到的确是自动补齐了。
图片batch
直接来测试一下图片数据
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import tensorflow as tf


def _byte_feature(value):
    """Wrap a single bytes value in a BytesList ``tf.train.Feature``."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


files = tf.gfile.Glob('*.jpeg')

# The `with` block closes the writer even if reading an image fails.
with tf.python_io.TFRecordWriter('file') as writer:
    for path in files:
        # Store the encoded JPEG bytes as they are; decoding happens on read.
        with tf.gfile.FastGFile(path, 'rb') as f:
            img_buff = f.read()
        feature = {'img': _byte_feature(tf.compat.as_bytes(img_buff))}
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())

filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    """Parse one serialized Example and decode its JPEG bytes into an image."""
    keys_to_features = {'img': tf.FixedLenFeature([], tf.string)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    image = tf.image.decode_jpeg(parsed_features['img'])
    return image


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
dataset = dataset.batch(2)
iterator = dataset.make_one_shot_iterator()
image = iterator.get_next()

with tf.Session() as sess:
    img = sess.run([image])
    print(len(img))
    print(img[0].shape)
    plt.imshow(img[0][0])

# Error raised at run time:
# Cannot batch tensors with different shapes in component 0. First element had
# shape [440,440,3] and element 1 had shape [415,438,3].
看到了没有,一个batch中图片的尺寸不同,就不可以batch了,我们必须要将一个batch的图片resize成相同的大小。
1
2
3
4
5
6
7
|
def _parse_function(example_proto):
    """Parse one serialized Example into a 224x224 float32 image."""
    keys_to_features = {'img': tf.FixedLenFeature([], tf.string)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    image = tf.image.decode_jpeg(parsed_features['img'])
    # Resizing directly would turn the uint8 image into floats, but plt.imshow
    # can only display uint8 data or floats between 0 and 1. This call converts
    # uint8 to floats in the 0-1 range, equivalent to dividing by 255.0.
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize_images(image, (224, 224))
    return image
但是有时候我们希望输入图片尺寸是不一样的,不需要resize,这样只能将batch_size设为1。一个batch中的图片shape必须是一样的,我们可以这样折中训练:使用tensorflow提供的动态填充接口,将一个batch中的图片填充为相同的shape。
1
|
# Height and width are left as None so the images of a batch are padded to a
# common shape; the channel dimension is fixed at 3.
dataset = dataset.padded_batch(2, padded_shapes=[None, None, 3])
如果我们想要将图片的名称作为标签保存下来要怎么做呢?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
# -*- coding: utf-8 -*-
import os

import matplotlib.pyplot as plt
import tensorflow as tf

out_charset = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"


def _byte_feature(value):
    """Wrap a single bytes value in a BytesList ``tf.train.Feature``."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(values):
    """Wrap an int, or a list of ints, in an Int64List ``tf.train.Feature``."""
    if not isinstance(values, list):
        values = [values]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


files = tf.gfile.Glob('*.jpg')

# The `with` block closes the writer even if reading an image fails.
with tf.python_io.TFRecordWriter('file') as writer:
    for path in files:
        with tf.gfile.FastGFile(path, 'rb') as f:
            img_buff = f.read()
        # The file name up to its first '.' is the label text; each character
        # is encoded as its index in out_charset (ValueError if it is absent).
        filename = os.path.basename(path).split('.')[0]
        label = [out_charset.index(ch) for ch in filename]
        feature = {
            'label': _int64_feature(label),
            'filename': _byte_feature(tf.compat.as_bytes(filename)),
            'img': _byte_feature(tf.compat.as_bytes(img_buff)),
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())

filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    """Parse one serialized Example into ``(image, label, filename)``."""
    keys_to_features = {
        'label': tf.VarLenFeature(tf.int64),
        'filename': tf.FixedLenFeature([], tf.string),
        'img': tf.FixedLenFeature([], tf.string),
    }
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    label = tf.sparse_tensor_to_dense(parsed_features['label'])
    filename = parsed_features['filename']
    image = tf.image.decode_jpeg(parsed_features['img'])
    return image, label, filename


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
# Three components are returned, so each one needs its own padded_shapes
# entry. The decoded image and the label vary in length, so their dimensions
# are given as None and padded. The filename is not decoded: it comes back as
# a single bytes value and needs no padding.
dataset = dataset.padded_batch(3, padded_shapes=([None, None, 3], [None], []))
iterator = dataset.make_one_shot_iterator()
image, label, filename = iterator.get_next()

with tf.Session() as sess:
    print(label.eval())
瞎试
如果写入的数据是一个list会是怎样呢
1
2
3
4
5
|
a = np.arange(16).reshape(2, 4, 2)

# Writing the rows of this array as an int64 feature raises:
# TypeError: [0, 1] has type list, but expected one of: int, long
不过想想也是,tf.train.Feature(int64_list=tf.train.Int64List(value=value))这个函数就是存储数据类型为int64的list的。但是如果我们要存储词向量该怎么办呢?例如一句话是一个样本s1='我爱你',假如使用one-hot编码,我=[0,0,1],爱=[0,1,0],你=[1,0,0],s1=[[0,0,1],[0,1,0],[1,0,0]]。这一个样本该怎么存储呢?
以上这篇tensorflow 变长序列存储实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/he_wen_jie/article/details/80269256