架构师 3 期 3 班 -week5- 作业

关注
发布于: 2020 年 12 月 22 日
作业题目用你熟悉的编程语言实现一致性 hash 算法。
编写测试用例测试这个算法，测试 100 万 KV 数据，10 个服务器节点的情况下，计算这些 KV 数据在服务器上分布数量的标准差，以评估算法的存储负载不均衡性。
解答类图
﻿
部署图
﻿
代码实现﻿
Slot 类是虚拟结点类，负责散落在hash环上给实际的CacheNode结点“占坑”。
这里给结点的名字先做了md5 再取hash值，为了让结点散落更平均，避免a-1,a-2,a-3这样的hash值太接近。
﻿
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
public class Slot {
    private int hc;//hashcode
    private String name;//slot的name
    public Slot(String node, int id) {
        this.name = node + "-" + id;
        //避免散列不均衡
        this.hc = md5(this.name).hashCode();
    }
    public int getHc() {
        return hc;
    }
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        Slot slot = (Slot) o;
        return hc == slot.hc;
    }
    public int hashCode() {
        return hc;
    }
    private String md5(String txt) {
        try {
            byte[] b = MessageDigest.getInstance("md5").digest(txt.getBytes("UTF8"));
            StringBuffer stringBuffer = new StringBuffer();
            for (int i = 0; i < b.length; i++) {
                stringBuffer.append(Integer.toHexString((0x000000FF & b[i]) | 0xFFFFFF00).substring(6));
            }
            return stringBuffer.toString();
        }catch (NoSuchAlgorithmException | UnsupportedEncodingException e){
            return null;
        }
    }
}
﻿
CacheNode类是实质的存储结点，提供存储的容器和计算传入的key应该落在哪个Slot上。
这里将Slot集合按hash值升序排列，在选择key的落点时，可以减少一部分循环。
﻿
import java.util.*;
public class CacheNode {
    private String name;//结点名称
    private int count;//虚拟结点数
    private List<Slot> slots;//虚拟结点集合
    private Map<String,Object> container;//实际的存储容器
    public CacheNode(String name, int count) {
        this.name = name;
        this.count = count;
        this.slots = new ArrayList<>(count);
        this.container = new HashMap<>();
        init();
    }
    private void init(){
        for (int i = 0; i < count; i++) {
            Slot slot = new Slot(name,i);
            if (!slots.contains(slot)){
                slots.add(slot);
            }
        }
        //排序为了优化轮训
        slots.sort((o1, o2) -> Integer.compare(o1.getHc(), o2.getHc()));
    }
    public int router(String key){
        for (Slot slot : slots){
            if (key.hashCode() <= slot.getHc())
                return slot.getHc();
        }
        return slots.stream().findFirst().get().getHc();
    }
    public void put(String key,Object value){
        container.put(key,value);
    }
    public Object query(String key){
        return container.get(key);
    }
    public void remove(String key){
        container.remove(key);
    }
    public int getContainerSize(){
        return container.size();
    }
    public String toString() {
        return "CacheNode{" +
                "name='" + name + '\'' +
                ", slot count=" + count +
                ", size =" + container.size() +
                '}';
    }
}
﻿
CacheSupport类是一个提供集群支持的类，提供扩容的add方法、路由方法和集群的状态信息打印。
﻿
import java.math.BigDecimal;
import java.util.*;
public class CacheSupport {
    private List<CacheNode> nodes;
    public CacheSupport() {
        this.nodes = new ArrayList<>();
    }
    public void addNode(CacheNode node){
        this.nodes.add(node);
    }
    public void put(String key,Object value){
        CacheNode cacheNode = router(key);
        cacheNode.put(key,value);
    }
    public Object query(String key){
        CacheNode cacheNode = router(key);
        return cacheNode.query(key);
    }
    public void remove(String key){
        CacheNode cacheNode = router(key);
        cacheNode.remove(key);
    }
    private CacheNode router(String key){
        Map<Integer,CacheNode> result = new HashMap<>(nodes.size());
        ArrayList<Integer> sort = new ArrayList<>();
        for (CacheNode node : nodes){
            int hc = node.router(key);
            if (!result.containsKey(hc)) {
                result.put(hc,node);
                sort.add(hc);
            }
        }
        if (sort.size() > 0){
            sort.sort((o1, o2) -> Integer.compare(o1, o2));
            return result.get(sort.stream().findFirst().get());
        }
        throw new RuntimeException("node not found");
    }
    public void stat(){
        ArrayList<Integer> integers = new ArrayList<>();
        for (CacheNode node : nodes){
            System.out.println(node.toString());
            integers.add(node.getContainerSize());
        }
        System.out.println("总体标准差:"+cal(integers));
    }
    //计算总体标准差:PSD= sqrt(1/(N)*((x1-xm)^2+(x2-xm)^2+..+(xN-xm)^2))
    private double cal(ArrayList<Integer> integers){
        Integer total = integers.stream().collect(Collectors.summingInt(x->x));
        double avg = new BigDecimal(total).divide(new BigDecimal(integers.size())
                ,2,BigDecimal.ROUND_HALF_UP).doubleValue();
        double total_σ2 = 0;
        for (int n : integers){
            total_σ2 += Math.pow(Math.abs(n - avg),2);
        }
        double avg_σ2 = new BigDecimal(total_σ2).divide(new BigDecimal(integers.size())
                ,2,BigDecimal.ROUND_HALF_UP).doubleValue();
        return Math.sqrt(avg_σ2);
    }
}
﻿
测试类测试类测试，计算平均差
import java.util.UUID;
public class Test {
    public static void main(String[] args) {
        CacheSupport support = init();
        for (int i = 0; i < 1000000; i++) {
            String key = UUID.randomUUID().toString().replaceAll("-","");
            support.put(key,i);
        }
        support.stat();
    }
    public static CacheSupport init(){
        String[] names = {"a","b","c","d","e","f","g","h","i","j"};
        int Node_Slot_Count = 150;
        CacheSupport support = new CacheSupport();
        for (int i = 0; i < 10; i++) {
            support.addNode(new CacheNode(names[i],Node_Slot_Count));
        }
        return support;
    }
}
﻿
输出如下：
﻿
CacheNode{name='a', slot count=150, size =111435}
CacheNode{name='a', slot count=150, size =111854}
CacheNode{name='b', slot count=150, size =96863}
CacheNode{name='c', slot count=150, size =101839}
CacheNode{name='d', slot count=150, size =117949}
CacheNode{name='e', slot count=150, size =89642}
CacheNode{name='f', slot count=150, size =92350}
CacheNode{name='g', slot count=150, size =108235}
CacheNode{name='h', slot count=150, size =93825}
CacheNode{name='i', slot count=150, size =100214}
CacheNode{name='j', slot count=150, size =87229}
平均差:9543.622781732312
﻿
这里我用了 50，100，150，300，600，1000，2000，4000，8000几个虚拟结点数做测试
测试下来，
虚拟结点数相同，标准差相对稳定
大的趋势是结点数越多，标准差越小。
虚拟结点数量大到一定程度，标准差变化就很小了，而计算key落地点的轮询时间明显变长。
所以选择150 应该是在时间和存储均衡之间的权衡