架构师训练营第五周命题作业
发布于: 2020 年 10 月 25 日
1. 用你熟悉的编程语言实现一致性 hash 算法。
2. 编写测试用例测试这个算法,测试 100 万 KV 数据,10 个服务器节点的情况下,计算这些 KV 数据在服务器上分布数量的标准差,以评估算法的存储负载不均衡性。
字符串生成 hash 值和标准差的计算方法,在网上搜了下写法。其他是自己写的。
程序:
package main
import (
"fmt"
"hash/crc32"
"log"
"math"
"sort"
)
type RealNode struct {
Name string
VirtualNodes []*VirtualNode
}
type VirtualNode struct {
RealNode *RealNode
Name string
Hash int
}
type Distribution struct {
StdDev float64
Exp float64
}
func (node *RealNode) Show() {
log.Println("RealNode begin")
log.Printf("RealNode: %+v", node)
for _, child := range node.VirtualNodes {
log.Printf("VirtualNode: %+v", child)
}
log.Println("RealNode end")
}
func (node *RealNode) SetupVirtualNodes(size int) {
node.VirtualNodes = make([]*VirtualNode, size)
index := 0
for index < size {
name := generateVirtualNodeName(node, index)
node.VirtualNodes[index] = &VirtualNode{
RealNode: node,
Name: name,
Hash: Hash(name),
}
index++
}
}
func main() {
testOneRound(10, 50)
testOneRound(10, 60)
testOneRound(10, 70)
testOneRound(10, 80)
testOneRound(10, 90)
testOneRound(10, 100)
testOneRound(10, 110)
testOneRound(10, 120)
testOneRound(10, 130)
testOneRound(10, 140)
testOneRound(10, 150)
testOneRound(10, 160)
testOneRound(10, 170)
testOneRound(10, 180)
testOneRound(10, 190)
testOneRound(10, 200)
testOneRound(10, 300)
testOneRound(10, 400)
testOneRound(10, 500)
testOneRound(10, 1000)
testOneRound(10, 1500)
testOneRound(10, 2000)
}
func testOneRound(realNodeCount int, virtualNodeCount int) {
log.Println("virtualNodeCount: ", virtualNodeCount)
realNodes := initRealNodes(realNodeCount, virtualNodeCount)
virtualNodes, _ := getSortedVirtualNodes(realNodes)
assertNoDuplicateHash(virtualNodes)
mockData := generateMockData(10000 * 100)
node2Times := hitNodes(mockData, virtualNodes)
showDistribution(node2Times)
log.Println()
}
func generateMockData(count int) []string {
data := make([]string, count)
n := 0
prefix := "test-string-%d"
for n < count {
data[n] = fmt.Sprintf(prefix, n)
n++
}
return data
}
func hitNodes(source []string, virtualNodes []*VirtualNode) map[string]int {
node2Times := make(map[string]int, 0)
n := 0
for _, str := range source {
targetNode := findNode(str, virtualNodes)
if v, found := node2Times[targetNode.RealNode.Name]; found {
node2Times[targetNode.RealNode.Name] = v + 1
} else {
node2Times[targetNode.RealNode.Name] = 1
}
n++
}
return node2Times
}
func showDistribution(node2Times map[string]int) {
hitTimes := make([]float64, 0)
for _, v := range node2Times {
hitTimes = append(hitTimes, float64(v))
}
distribution := distribution(hitTimes)
log.Printf("stddev: %.2f, exp: %.2f", distribution.StdDev, distribution.Exp)
}
func initRealNodes(realNodeCount int, virtualNodeCount int) []*RealNode {
nodes := make([]*RealNode, realNodeCount)
index := 0
for index < realNodeCount {
node := &RealNode{Name: generateRealNodeName(index)}
node.SetupVirtualNodes(virtualNodeCount)
nodes[index] = node
index++
}
return nodes
}
func getSortedVirtualNodes(nodes []*RealNode) ([]*VirtualNode, map[int]*VirtualNode) {
virtualNodes := make([]*VirtualNode, 0)
hash2VirtualNode := make(map[int]*VirtualNode, 0)
for _, node := range nodes {
for _, vnode := range node.VirtualNodes {
virtualNodes = append(virtualNodes, vnode)
hash2VirtualNode[vnode.Hash] = vnode
}
}
sort.Slice(virtualNodes, func(i, j int) bool {
return virtualNodes[i].Hash < virtualNodes[j].Hash
})
return virtualNodes, hash2VirtualNode
}
func assertNoDuplicateHash(nodes []*VirtualNode) {
hash := make(map[int]struct{}, 0)
for _, node := range nodes {
if _, found := hash[node.Hash]; found {
panic(fmt.Sprintf("Duplicate hash %d", node.Hash))
return
} else {
hash[node.Hash] = struct{}{}
}
}
}
func findNode(value string, virtualNodes []*VirtualNode) *VirtualNode {
hash := Hash(value)
for _, node := range virtualNodes {
if hash <= node.Hash {
return node
}
}
//value比最后一个节点的hash大,所以要落到环的第一个节点
return virtualNodes[0]
}
func generateRealNodeName(index int) string {
return fmt.Sprintf("RealNode-%d", index)
}
func generateVirtualNodeName(realNode *RealNode, childIndex int) string {
return fmt.Sprintf("%s-%d", realNode.Name, childIndex)
}
// https://blog.csdn.net/lanyang123456/article/details/79827101?utm_medium=distribute.pc_relevant.none-task-blog-title-9&spm=1001.2101.3001.4242
// Hash hashes a string to a unique hashcode.
// crc32 returns a uint32, but for our use we need a non negative integer. Here we cast to an integer and invert it if the result is negative.
func Hash(s string) int {
v := int(crc32.ChecksumIEEE([]byte(s)))
if v >= 0 {
return v
} else {
return -v
}
}
func distribution(dataSource []float64) *Distribution {
//数学期望
var sum float64 = 0
for _, v := range dataSource {
sum += v
}
exp := float64(sum) / float64(len(dataSource))
//标准差
var variance float64
for _, v := range dataSource {
variance += math.Pow((v - exp), 2)
}
stddev := math.Sqrt(variance / float64(len(dataSource)))
//fmt.Println("stddev:", stddev)
//fmt.Println("exp:", exp)
//正态分布公式
//a := 1 / (math.Sqrt(2*math.Pi) * stddev) * math.Pow(math.E, (-math.Pow((exp-exp), 2)/(2*math.Pow(stddev, 2))))
//fmt.Println(a)
return &Distribution{
StdDev: stddev,
Exp: exp,
}
}
复制代码
测试结果:
//虚拟节点数 标准差
//50 15438.54
//60 16700.24
//70 17957.28
//80 18598.59
//90 17580.13
//100 17371.79
//110 16866.18
//120 14568.59
//130 13964.82
//140 13772.18
//150 15097.51
//160 17097.60
//170 17416.83
//180 18204.47
//190 16835.38
//200 17621.08
//300 18522.42
//400 14791.96
//500 14673.12
//1000 11938.21
//1500 13548.18
//2000 11840.77
复制代码
划线
评论
复制
发布于: 2020 年 10 月 25 日阅读数: 39
版权声明: 本文为 InfoQ 作者【一马行千里】的原创文章。
原文链接:【http://xie.infoq.cn/article/78bc453f78cac83b33bb26f0b】。文章转载请联系作者。
一马行千里
关注
还未添加个人签名 2018.07.26 加入
还未添加个人简介
评论