GOLANG 代码优化的失败经历
第一败不是查表就一定比多个条件判断更快要看数据的分布一个解析 json 中的数值的函数中有这样一个判断字符范围的函数:func isNumberChar(ch byte) bool { return (ch 0 ch 9) || ch . || ch - || ch e || ch E || ch }如果一个字符不属于上面的字符集那么需要 7 次比较运算。所以我的优化思路是建立一个 256 个元素的查找表每个字符进行一次查表就行了。我优化后的代码如下var numberSearchTable func() [256]bool { var table [256]bool for _, c : range []byte(0123456789.eE-) { table[c] true } return table }() func isNumberChar(ch byte) bool { return numberSearchTable[ch] }实际运行发现并没有比原版更快。原因如下•(ch 0 ch 9)会被优化成tmp ch - 0; tmp 9• 输入分布很关键如果大多数字符是数字isNumberChar 的第一条范围判断命中率很高比较链会非常占优。• 分支预测很关键比较链快不快很看数据分布是否稳定。• load 成本也关键查表版本一定有一次内存读取哪怕表很小也不等于一定更快。简单的来说对于这个解析库而言大多数情况下这个函数输入的内容大概率都是 0 ~ 9所以原本更快。教训不是想当然的减少计算指令还要考虑数据的分布第二败不是组合满了 8 个 switch 语句的 case 常量编译器就会把代码编译为 JumpTable 的模式我看到了如下函数:func (c *cache) parseValue(s string, depth int) (*Value, string, error) { if len(s) 0 { return nil, s, fmt.Errorf(cannot parse empty string) } depth if depth MaxDepth { return nil, s, fmt.Errorf(too big depth for the nested JSON; it exceeds %d, MaxDepth) } if s[0] { { v, tail, err : c.parseObject(s[1:], depth) if err ! nil { return nil, tail, fmt.Errorf(cannot parse object: %s, err) } return v, tail, nil } if s[0] [ { v, tail, err : c.parseArray(s[1:], depth) if err ! nil { return nil, tail, fmt.Errorf(cannot parse array: %s, err) } return v, tail, nil } if s[0] { ss, tail, err : parseRawString(s[1:]) if err ! nil { return nil, tail, fmt.Errorf(cannot parse string: %s, err) } v : c.getValue() v.t typeRawString v.s ss return v, tail, nil } if s[0] t { if len(s) len(true) || s[:len(true)] ! true { return nil, s, fmt.Errorf(unexpected value found: %q, s) } return valueTrue, s[len(true):], nil } if s[0] f { if len(s) len(false) || s[:len(false)] ! false { return nil, s, fmt.Errorf(unexpected value found: %q, s) } return valueFalse, s[len(false):], nil } if s[0] n { if len(s) len(null) || s[:len(null)] ! null { // Try parsing NaN if len(s) 3 strings.EqualFold(s[:3], nan) { v : c.getValue() v.t TypeNumber v.s s[:3] return v, s[3:], nil } return nil, s, fmt.Errorf(unexpected value found: %q, s) } return valueNull, s[len(null):], nil } ns, tail, err : parseRawNumber(s) if err ! nil { return nil, tail, fmt.Errorf(cannot parse number: %s, err) } v : c.getValue() v.t TypeNumber v.s ns return v, tail, nil }我认为是if s[0]xx这里有点浪费越往后的代码就要经历越多的条件判断。如果我把所有常量凑够 8 个让编译器把顺序的条件判断变成 JumpTable 那么根据常量就能跳转到对应的代码。我修改成了以下:func (c *cache) parseValue(s string, depth int) (*Value, string, error) { if len(s) 0 { return nil, s, fmt.Errorf(cannot parse empty string) } depth if depth MaxDepth { return nil, s, fmt.Errorf(too big depth for the nested JSON; it exceeds %d, MaxDepth) } // 尝试使用 jump table switch s[0] { case {: v, tail, err : c.parseObject(s[1:], depth) if err ! nil { return nil, tail, fmt.Errorf(cannot parse object: %s, err) } return v, tail, nil case [: v, tail, err : c.parseArray(s[1:], depth) if err ! nil { return nil, tail, fmt.Errorf(cannot parse array: %s, err) } return v, tail, nil case : ss, tail, err : parseRawString(s[1:]) if err ! nil { return nil, tail, fmt.Errorf(cannot parse string: %s, err) } v : c.getValue() v.t typeRawString v.s ss return v, tail, nil case t: if len(s) len(true) || s[:len(true)] ! true { return nil, s, fmt.Errorf(unexpected value found: %q, s) } return valueTrue, s[len(true):], nil case f: if len(s) len(false) || s[:len(false)] ! false { return nil, s, fmt.Errorf(unexpected value found: %q, s) } return valueFalse, s[len(false):], nil case n: if len(s) len(null) || s[:len(null)] ! null { // Try parsing NaN if len(s) 3 strings.EqualFold(s[:3], nan) { v : c.getValue() v.t TypeNumber v.s s[:3] return v, s[3:], nil } return nil, s, fmt.Errorf(unexpected value found: %q, s) } return valueNull, s[len(null):], nil case : panic(to make complier happy, this case will never be hit, since skipWS is called before parseValue) case \t: panic(to make complier happy, this case will never be hit, since skipWS is called before parseValue 2) default: ns, tail, err : parseRawNumber(s) if err ! nil { return nil, tail, fmt.Errorf(cannot parse number: %s, err) } v : c.getValue() v.t TypeNumber v.s ns return v, tail, nil } }我知道 golang 编译器对于 switch case 的常量要 8 个以上才会编译为 jump table所以特意凑够了 8 个。运行后发现性能退化很多且通过反汇编发现并未生成 jump table.我的错误如下jump table 本质上是减少跳转的次数提升分支预测的能力。当 switch case 内的代码块是纯计算时jump table 减少的跳转是有效的当 switch case 内的代码块 本身就有很多跳转时编译为 jump table 就没意义了switch case 内的常量不是连续的常量这样也不会生成 jump table.第三败循环展开确实让某些路径变快了但是数据分布导致热路径反而慢了我看见了这样的 uint64 解析的函数// ParseUint64 parses uint64 from s. // // It is equivalent to strconv.ParseUint(s, 10, 64), but is faster. // // See also ParseUint64BestEffort. func ParseUint64(s string) (uint64, error) { if len(s) 0 { return 0, fmt.Errorf(cannot parse uint64 from empty string) } i : uint(0) d : uint64(0) j : i for i uint(len(s)) { if s[i] 0 s[i] 9 { d d*10 uint64(s[i]-0) i if i 18 { // The integer part may be out of range for uint64. // Fall back to slow parsing. dd, err : strconv.ParseUint(s, 10, 64) if err ! nil { return 0, err } return dd, nil } continue } break } if i j { return 0, fmt.Errorf(cannot parse uint64 from %q, s) } if i uint(len(s)) { // Unparsed tail left. return 0, fmt.Errorf(unparsed tail left after parsing uint64 from %q: %q, s, s[i:]) } return d, nil }我的想法是表示一个 uint64 的字符串是有限的长度那么就可以通过循环展开来提升性能。我写入如下的很恐怖的版本func ParseUint64(s string) (uint64, error) { if len(s) 0 { return 0, fmt.Errorf(cannot parse uint64 from empty string) } i : uint(0) for ; i uint(len(s)); i { if !(s[i] 0 s[i] 9) { break } } // find end location if i 0 { return 0, fmt.Errorf(cannot parse uint64 from %q, s) } if i uint(len(s)) { // Unparsed tail left. return 0, fmt.Errorf(unparsed tail left after parsing uint64 from %q: %q, s, s[i:]) } var ss *[18]byte (*[18]byte)(unsafe.Pointer(unsafe.StringData(s))) // use jump table and loop unrolling var d uint64 switch i { case 0: d 0 case 1: d uint64(ss[0] - 0) case 2: d uint64(ss[0]-0)*10 uint64(ss[1]-0) case 3: d uint64(ss[0] - 0) d * 10 // n*10 will be optimized by the compiler to (n3) (n1) d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) case 4: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) case 5: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) case 6: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) case 7: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) case 8: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) case 9: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) d * 10 d uint64(ss[8] - 0) case 10: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) d * 10 d uint64(ss[8] - 0) d * 10 d uint64(ss[9] - 0) case 11: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) d * 10 d uint64(ss[8] - 0) d * 10 d uint64(ss[9] - 0) d * 10 d uint64(ss[10] - 0) case 12: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) d * 10 d uint64(ss[8] - 0) d * 10 d uint64(ss[9] - 0) d * 10 d uint64(ss[10] - 0) d * 10 d uint64(ss[11] - 0) case 13: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) d * 10 d uint64(ss[8] - 0) d * 10 d uint64(ss[9] - 0) d * 10 d uint64(ss[10] - 0) d * 10 d uint64(ss[11] - 0) d * 10 d uint64(ss[12] - 0) case 14: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) d * 10 d uint64(ss[8] - 0) d * 10 d uint64(ss[9] - 0) d * 10 d uint64(ss[10] - 0) d * 10 d uint64(ss[11] - 0) d * 10 d uint64(ss[12] - 0) d * 10 d uint64(ss[13] - 0) case 15: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) d * 10 d uint64(ss[8] - 0) d * 10 d uint64(ss[9] - 0) d * 10 d uint64(ss[10] - 0) d * 10 d uint64(ss[11] - 0) d * 10 d uint64(ss[12] - 0) d * 10 d uint64(ss[13] - 0) d * 10 d uint64(ss[14] - 0) case 16: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) d * 10 d uint64(ss[8] - 0) d * 10 d uint64(ss[9] - 0) d * 10 d uint64(ss[10] - 0) d * 10 d uint64(ss[11] - 0) d * 10 d uint64(ss[12] - 0) d * 10 d uint64(ss[13] - 0) d * 10 d uint64(ss[14] - 0) d * 10 d uint64(ss[15] - 0) case 17: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) d * 10 d uint64(ss[8] - 0) d * 10 d uint64(ss[9] - 0) d * 10 d uint64(ss[10] - 0) d * 10 d uint64(ss[11] - 0) d * 10 d uint64(ss[12] - 0) d * 10 d uint64(ss[13] - 0) d * 10 d uint64(ss[14] - 0) d * 10 d uint64(ss[15] - 0) d * 10 d uint64(ss[16] - 0) case 18: d uint64(ss[0] - 0) d * 10 d uint64(ss[1] - 0) d * 10 d uint64(ss[2] - 0) d * 10 d uint64(ss[3] - 0) d * 10 d uint64(ss[4] - 0) d * 10 d uint64(ss[5] - 0) d * 10 d uint64(ss[6] - 0) d * 10 d uint64(ss[7] - 0) d * 10 d uint64(ss[8] - 0) d * 10 d uint64(ss[9] - 0) d * 10 d uint64(ss[10] - 0) d * 10 d uint64(ss[11] - 0) d * 10 d uint64(ss[12] - 0) d * 10 d uint64(ss[13] - 0) d * 10 d uint64(ss[14] - 0) d * 10 d uint64(ss[15] - 0) d * 10 d uint64(ss[16] - 0) d * 10 d uint64(ss[17] - 0) default: dd, err : strconv.ParseUint(s, 10, 64) if err ! nil { return 0,err } return dd, nil } return d, nil }实际测试发现无论字符串多长处理时间都比较平均。相比原版越长的字符串加速效果越明显但这仍然是不行对于 json 数据短的整形是常见的类型而不是很长的的整形。