Go语言模型:string的底层数据结构与高效操作详解_Golang

Golang的string类型底层数据结构简单，本质也是一个结构体实例，且是const不可变。

string的底层数据结构

通过下面一个例子来看：

				?

									package main

									import (

									    "fmt"

									    "unsafe"

									)

									// from: string.go (in GoLand, double-press Shift to locate the file quickly)
									//
									// stringStruct is the two-word header used here to view a string value:
									// a pointer to the bytes followed by the byte count.
									type stringStruct struct {
									    array  unsafe.Pointer // points at the first byte of a [length]byte array
									    length int            // length in bytes
									}

									// main views a string through stringStruct, copies its bytes out with
									// pointer arithmetic, and then concatenates to show that + yields a new
									// string with its own header.
									func main() {
									    test := "hello"
									    p := (*stringStruct)(unsafe.Pointer(&test))
									    fmt.Println(&p, p) // e.g. 0xc420070018 &{0xa3f71 5}

									    c := make([]byte, p.length)
									    for i := range c {
									        // Pointer arithmetic goes through uintptr, but the conversion back to
									        // unsafe.Pointer must happen in the same expression: a uintptr kept in
									        // a variable is just an integer and does not keep the object alive.
									        c[i] = *(*byte)(unsafe.Pointer(uintptr(p.array) + uintptr(i)))
									    }
									    fmt.Println(c)         // [104 101 108 108 111]
									    fmt.Println(string(c)) // []byte --> string, "hello"

									    test2 := test + " world" // strings are immutable, so + produces a new string
									    p2 := (*stringStruct)(unsafe.Pointer(&test2))
									    fmt.Println(&p2, p2) // e.g. 0xc420028030 &{0xc42000a2e5 11}
									    fmt.Println(test2)   // hello world
									}

string的拼接与修改

+操作

string类型是一个不可变类型，那么任何对string的修改都会新生成一个string的实例，如果是考虑效率的场景就要好好考虑一下如何修改了。先说一下最常用的+操作，同样是上面的例子，看一下+操作拼接字符串的反汇编：

				?

									25      test2 := test + " world"

									 0x00000000004824d7 <+1127>:  lea 0x105a2(%rip),%rax  # 0x492a80

									 0x00000000004824de <+1134>:  mov %rax,(%rsp)

									 0x00000000004824e2 <+1138>:  callq 0x40dda0 <runtime.newobject> # 调用newobject函数

									 0x00000000004824e7 <+1143>:  mov 0x8(%rsp),%rax

									 0x00000000004824ec <+1148>:  mov %rax,0xa0(%rsp)

									 0x00000000004824f4 <+1156>:  mov 0xa8(%rsp),%rax

									 0x00000000004824fc <+1164>:  mov 0x8(%rax),%rcx

									 0x0000000000482500 <+1168>:  mov (%rax),%rax

									 0x0000000000482503 <+1171>:  mov %rax,0x8(%rsp)

									 0x0000000000482508 <+1176>:  mov %rcx,0x10(%rsp)

									 0x000000000048250d <+1181>:  movq $0x0,(%rsp)

									 0x0000000000482515 <+1189>:  lea 0x30060(%rip),%rax  # 0x4b257c

									 0x000000000048251c <+1196>:  mov %rax,0x18(%rsp)

									 0x0000000000482521 <+1201>:  movq $0x6,0x20(%rsp)

									 0x000000000048252a <+1210>:  callq 0x43cc00 <runtime.concatstring2> # 调用concatstring2函数

因为当前的go[2018.11 version: go1.11]不是遵循默认的x86 calling convention用寄存器传参，而是通过stack进行传参，所以go的反汇编不像c的那么容易理解，不过大概看懂+背后的操作还是没问题的，看一下runtime源码的拼接函数：

				?

									// concatstring2 is the two-operand entry point: it slices the fixed-size
									// array a and hands the work to concatstrings.
									func concatstring2(buf *tmpBuf, a [2]string) string {

									 return concatstrings(buf, a[:])

									}

									// concatstrings implements a Go string concatenation x+y+z+...

									// The operands are passed in the slice a.

									// If buf != nil, the compiler has determined that the result does not

									// escape the calling function, so the string data can be stored in buf

									// if small enough.

									func concatstrings(buf *tmpBuf, a []string) string {

									 // idx remembers the index of the last non-empty operand, l accumulates
									 // the total byte length, and count is the number of non-empty operands.
									 idx := 0

									 l := 0

									 count := 0

									 for i, x := range a {

									  n := len(x)

									  if n == 0 {

									   continue

									  }

									  // l+n < l can only hold when the running total has wrapped around.
									  if l+n < l {

									   throw("string concatenation too long")

									  }

									  l += n

									  count++

									  idx = i

									 }

									 if count == 0 {

									  return ""

									 }

									 // If there is just one string and either it is not on the stack

									 // or our result does not escape the calling frame (buf != nil),

									 // then we can return that string directly.

									 if count == 1 && (buf != nil || !stringDataOnStack(a[idx])) {

									  return a[idx]

									 }

									 // NOTE(review): rawstringtmp is not shown here; presumably s and b share
									 // l bytes of storage so that filling b fills s - confirm in the runtime.
									 s, b := rawstringtmp(buf, l)

									 for _, x := range a {

									  copy(b, x) // the key step: copy this operand's bytes into the result

									  b = b[len(x):]

									 }

									 return s

									}

分析runtime的concatstrings实现，可以看出+最后新申请buf，拷贝原来的string到buf，最后返回新实例。那么每次的+操作，都会涉及新申请buf，然后是对应的copy。如果反复使用+，就不可避免有大量的申请内存操作，对于大量的拼接，性能就会受到影响了。

bytes.Buffer

通过看源码，bytes.Buffer 增长buffer时是按照2倍来增长内存，可以有效避免频繁的申请内存，通过一个例子来看：

				?

									// main appends "hi " to a bytes.Buffer ten times and prints the
									// accumulated contents.
									func main() {
									    const rounds = 10
									    out := new(bytes.Buffer)
									    for written := 0; written != rounds; written++ {
									        out.WriteString("hi ")
									    }
									    fmt.Println(out.String())
									}

对应的bytes包库函数源码

				?

									// @file: buffer.go

									// WriteString copies s into the buffer at the index obtained from
									// tryGrowByReslice or, failing that, from grow. It returns the number of
									// bytes copied and a nil error.
									func (b *Buffer) WriteString(s string) (n int, err error) {

									 b.lastRead = opInvalid

									 // NOTE(review): tryGrowByReslice is not shown here; judging by its name
									 // and ok result it is the cheap path that avoids a reallocation - confirm.
									 m, ok := b.tryGrowByReslice(len(s))

									 if !ok {

									  m = b.grow(len(s)) // efficient growth strategy -> let capacity get twice as large

									 }

									 return copy(b.buf[m:], s), nil

									}

									// @file: buffer.go

									// let capacity get twice as large !!!

									// grow makes room for n more bytes and returns the index at which the
									// caller should write them. It panics with ErrTooLarge when the required
									// size check below fails.
									func (b *Buffer) grow(n int) int {

									 m := b.Len()

									 // If buffer is empty, reset to recover space.

									 if m == 0 && b.off != 0 {

									  b.Reset()

									 }

									 // Try to grow by means of a reslice.

									 if i, ok := b.tryGrowByReslice(n); ok {

									  return i

									 }

									 // Check if we can make use of bootstrap array.

									 if b.buf == nil && n <= len(b.bootstrap) {

									  b.buf = b.bootstrap[:n]

									  return 0

									 }

									 c := cap(b.buf)

									 if n <= c/2-m {

									  // We can slide things down instead of allocating a new

									  // slice. We only need m+n <= c to slide, but

									  // we instead let capacity get twice as large so we

									  // don't spend all our time copying.

									  copy(b.buf, b.buf[b.off:])

									 } else if c > maxInt-c-n {

									  // Rearranged form of 2*c+n > maxInt, written so the check itself
									  // cannot overflow (assuming maxInt is the largest int value).
									  panic(ErrTooLarge)

									 } else {

									  // Not enough space anywhere, we need to allocate.

									  buf := makeSlice(2*c + n)

									  copy(buf, b.buf[b.off:])

									  b.buf = buf

									 }

									 // Restore b.off and len(b.buf).

									 b.off = 0

									 b.buf = b.buf[:m+n]

									 return m

									}

strings.Join

这个函数可以一次申请最终string的大小，但是使用得预先准备好所有string，这种场景也是高效的，一个例子：

				?

									// main collects ten "hi" strings and prints them joined by single spaces.
									func main() {
									    const count = 10
									    // The final length is known up front, so size the slice once instead of
									    // letting append reallocate it as it grows.
									    strs := make([]string, 0, count)
									    for i := 0; i < count; i++ {
									        strs = append(strs, "hi")
									    }
									    fmt.Println(strings.Join(strs, " "))
									}

对应库的源码：

				?

									// Join concatenates the elements of a to create a single string. The separator string

									// sep is placed between elements in the resulting string.

									func Join(a []string, sep string) string {

									 switch len(a) {

									 case 0:

									  return ""

									 case 1:

									  return a[0]

									 case 2:

									  // Special case for common small values.

									  // Remove if golang.org/issue/6714 is fixed

									  return a[0] + sep + a[1]

									 case 3:

									  // Special case for common small values.

									  // Remove if golang.org/issue/6714 is fixed

									  return a[0] + sep + a[1] + sep + a[2]

									 }

									 // Work out the size of the final string up front.

									 n := len(sep) * (len(a) - 1) // one separator between each pair of neighbours

									 for i := 0; i < len(a); i++ {

									  n += len(a[i])

									 }

									 // A single allocation of exactly n bytes; bp tracks the write position.
									 b := make([]byte, n)

									 bp := copy(b, a[0])

									 for _, s := range a[1:] {

									  bp += copy(b[bp:], sep)

									  bp += copy(b[bp:], s)

									 }

									 return string(b)

									}

strings.Builder (go1.10+)

看到这个名字，就想到了Java的库，哈哈，这个Builder用起来是最方便的，不过是在go1.10才引入的。其高效同样体现在成倍的内存增长上：WriteString函数直接利用了slice的append扩容策略（容量较小时按2倍增长，容量较大时增长系数会变小）。

一个例子：

				?

									// main writes "hi " into a strings.Builder ten times and prints the
									// accumulated string.
									func main() {
									    sb := new(strings.Builder)
									    for remaining := 10; remaining > 0; remaining-- {
									        sb.WriteString("hi ")
									    }
									    result := sb.String()
									    fmt.Println(result)
									}

对应库的源码

				?

									// @file: builder.go

									// WriteString appends the contents of s to b's buffer.

									// It returns the length of s and a nil error.

									func (b *Builder) WriteString(s string) (int, error) {

									 // NOTE(review): copyCheck is not shown here; presumably it guards against
									 // using a Builder that was copied by value - confirm in builder.go.
									 b.copyCheck()

									 // Growth is left entirely to append; no capacity logic lives here.
									 b.buf = append(b.buf, s...)

									 return len(s), nil

									}

总结

Golang的字符串处理还是挺方便的，有垃圾回收和一些内置的语言级写法支持，让复杂字符串操作没有那么繁琐了，比起C/C++高效了不少。

补充：go string的内部实现

go string 内部实现

这个string的探索

来个例子

				?

									// boo returns the sum of a and b together with the constant string "abcd".
									func boo(a int, b int) (int, string) {
									    const tag = "abcd"
									    total := a + b
									    return total, tag
									}

				?

									81079 000000000044dfa0 <main.boo>:

									81080 44dfa0:>------48 c7 44 24 18 00 00 >--movq $0x0,0x18(%rsp)

									81081 44dfa7:>------00 00- 

									81082 44dfa9:>------0f 57 c0    >--xorps %xmm0,%xmm0

									81083 44dfac:>------0f 11 44 24 20  >--movups %xmm0,0x20(%rsp)

									81084 44dfb1:>------48 8b 44 24 08  >--mov 0x8(%rsp),%rax

									81085 44dfb6:>------48 03 44 24 10  >--add 0x10(%rsp),%rax

									81086 44dfbb:>------48 89 44 24 18  >--mov %rax,0x18(%rsp)

									81087 44dfc0:>------48 8d 05 d4 eb 01 00 >--lea 0x1ebd4(%rip),%rax  # 46cb9b <go.string.*+0xbb>

									81088 44dfc7:>------48 89 44 24 20  >--mov %rax,0x20(%rsp)

									81089 44dfcc:>------48 c7 44 24 28 04 00 >--movq $0x4,0x28(%rsp)

									81090 44dfd3:>------00 00- 

									81091 44dfd5:>------c3     >--retq---

其中

				?

									81087 44dfc0:>------48 8d 05 d4 eb 01 00 >--lea 0x1ebd4(%rip),%rax  # 46cb9b <go.string.*+0xbb>

									81088 44dfc7:>------48 89 44 24 20  >--mov %rax,0x20(%rsp)

									81089 44dfcc:>------48 c7 44 24 28 04 00 >--movq $0x4,0x28(%rsp)

									81090 44dfd3:>------00 00- 

									81091 44dfd5:>------c3     >--retq---

									lea 0x1ebd4(%rip),%rax得到char*, mov %rax,0x20(%rsp)复制给返回值, movq $0x4,0x28(%rsp)把长度也填进去,