Writing Docker by Yourself -- 6.5 Configuring the Container Network at Startup
- April 25, 2022
Introduction
In the previous articles we created a bridge with our own program and configured the container network by hand to get it working. In this article we use that knowledge to configure the container network automatically.
Source Code
The code is available on both Gitee and GitHub.
The version tag for this chapter is 6.3. To keep things easy to follow as the code grows, you can check out that tag.
Implementation
The order of steps in the book's original implementation is basically reasonable, but the network setup part had problems on my machine, so I made a few adjustments; they are explained in the code sections below.
The overall approach builds on the previous two articles and goes as follows:
Create the bridge with the network command
Specify the bridge when starting the container
Set up the host-side veth: attach it to the bridge and bring it up
Set up port mappings if the -p option is given
Set up the veth inside the container: enter the container first, configure lo and the veth, assign the IP, and bring them up
The book configures the container-side veth first and then enters the container's network namespace; that had problems on my machine, so the order was adjusted.
Adding Command Options
Add the -net and -p options to the run command to specify the bridge and the port mappings.
var RunCommand = cli.Command{
	Name:  "run",
	Usage: `Create a container with namespace and cgroups limit mydocker run -ti [command]`,
	Flags: []cli.Flag{
		......
		cli.StringFlag{
			Name:  "net",
			Usage: "container network",
		},
		cli.StringSliceFlag{
			Name:  "p",
			Usage: "port mapping",
		},
	},
	Action: func(context *cli.Context) error {
		......
		network := context.String("net")
		portMapping := context.StringSlice("p")
		run.Run(tty, detach, cmdArray, resConfig, volume, containerName, envSlice, network, portMapping)
		return nil
	},
}
Configuring the Network at Startup
In the Run function, if the net option is set, configure the network for the container.
func Run(tty, detach bool, cmdArray []string, config *subsystem.ResourceConfig, volume, containerName string,
	......
	if nw != "" {
		// initialize the network configuration
		_ = network.Init()
		containerInfo := &container.ContainerInfo{
			ID:          id,
			Pid:         strconv.Itoa(parent.Process.Pid),
			Name:        containerName,
			PortMapping: portMapping,
		}
		if err := network.Connect(nw, containerInfo); err != nil {
			log.Errorf("connect network err: %v", err)
			return
		}
	}
	......
}
Configuring the network falls into three parts: setting up the host-side veth, setting up the host port-mapping rules, and setting up the veth inside the container.
func Connect(networkName string, cinfo *container.ContainerInfo) error {
	network, ok := networks[networkName]
	if !ok {
		return fmt.Errorf("no such network: %s", networkName)
	}
	// allocate an IP address for the container
	_, ipNet, _ := net.ParseCIDR(network.Subnet)
	ip, err := ipAllocator.Allocate(ipNet)
	if err != nil {
		return err
	}
	log.Infof("allocated container ip: %s", ip)
	// create the network endpoint
	ep := &Endpoint{
		ID:          fmt.Sprintf("%s-%s", cinfo.ID, networkName),
		IpAddress:   ip,
		Network:     network,
		PortMapping: cinfo.PortMapping,
	}
	// call the network driver to attach and configure the endpoint
	if err = drivers[network.Driver].Connect(network, ep); err != nil {
		log.Errorf("connect err: %v", err)
		return err
	}
	// configure the port mappings
	if err = configPortMapping(ep, cinfo); err != nil {
		log.Errorf("config port mapping: %v", err)
		return err
	}
	// enter the container's namespace and configure the IP address of its network device
	if err = configEndpointIpAddressAndRoute(ep, cinfo); err != nil {
		log.Errorf("configEndpointIpAddressAndRoute err: %v", err)
		return err
	}
	return nil
}
Configuring the Host-Side veth
This part simply calls the existing driver's Connect function from the previous article, so it is not explained again here.
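For readers who skipped that article: the bridge driver roughly creates a veth pair whose host end is enslaved to the bridge and brought up, leaving the peer (the cif-... interface) to be moved into the container later. The snippet below is only a rough sketch of such a Connect, assuming the BridgeNetworkDriver, Network and Endpoint types from the earlier article; the actual code in the repository may differ in details.
func (d *BridgeNetworkDriver) Connect(network *Network, endpoint *Endpoint) error {
	// the bridge created earlier with the network command
	br, err := netlink.LinkByName(network.Name)
	if err != nil {
		return err
	}
	la := netlink.NewLinkAttrs()
	// interface names are limited in length, so only a prefix of the endpoint ID is used
	la.Name = endpoint.ID[:5]
	// setting MasterIndex attaches the host end of the veth pair to the bridge
	la.MasterIndex = br.Attrs().Index
	endpoint.Device = netlink.Veth{
		LinkAttrs: la,
		PeerName:  "cif-" + endpoint.ID[:5],
	}
	// create the veth pair and bring the host end up
	if err = netlink.LinkAdd(&endpoint.Device); err != nil {
		return fmt.Errorf("add endpoint device err: %w", err)
	}
	if err = netlink.LinkSetUp(&endpoint.Device); err != nil {
		return fmt.Errorf("set endpoint device up err: %w", err)
	}
	return nil
}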
Configuring the Host Port-Mapping Rules
Setting up port mapping means adding the iptables DNAT rules covered in the previous two articles; the implementation here also just shells out to the iptables command.
// configure port mappings
func configPortMapping(ep *Endpoint, cinfo *container.ContainerInfo) error {
	log.Infof("port mapping: %s, %s", ep.IpAddress, ep.PortMapping)
	// iterate over the container's port-mapping list
	for _, pm := range ep.PortMapping {
		// split the host port and the container port
		portMapping := strings.Split(pm, ":")
		if len(portMapping) != 2 {
			log.Errorf("port mapping format err: %v", pm)
			continue
		}
		// iptables has no Go implementation, so the rule is added by running the command via exec.Command
		// add a DNAT rule to the PREROUTING chain of the nat table
		// so that requests to the host port are forwarded to the container's address and port
		iptableCmd := fmt.Sprintf("-t nat -A PREROUTING -p tcp -m tcp --dport %s -j DNAT --to-destination %s:%s",
			portMapping[0], ep.IpAddress.String(), portMapping[1])
		log.Infof(iptableCmd)
		// run the iptables command to add the port-mapping and forwarding rule
		cmd := exec.Command("iptables", strings.Split(iptableCmd, " ")...)
		output, err := cmd.Output()
		if err != nil {
			log.Errorf("iptables output err:%s -- %v", output, err)
			continue
		}
	}
	return nil
}
Configuring the Container-Side veth
The book configures the veth first and then moves it into the container. In my tests that did not take effect; the veth has to be moved into the container first and configured afterwards, so the code below has been adjusted accordingly.
// configure the container endpoint's address and routes
func configEndpointIpAddressAndRoute(ep *Endpoint, cinfo *container.ContainerInfo) error {
	// look up the other end of the endpoint's veth pair
	peerLink, err := netlink.LinkByName(ep.Device.PeerName)
	if err != nil {
		return fmt.Errorf("fail config endpoint err: %w", err)
	}
	log.Infof("peerLink index: %d -- %s", peerLink.Attrs().Index, peerLink.Attrs().Name)
	// move the container end of the veth into the container's network namespace
	// and perform all of the following configuration inside that namespace;
	// the trailing () defers the returned closure, which restores the default
	// namespace when this function returns (see enterContainerNetns below)
	defer enterContainerNetns(&peerLink, cinfo, ep)()
	return nil
}
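The defer enterContainerNetns(...)() line relies on how defer evaluates its expression: the call enterContainerNetns(...) runs immediately at the defer statement (entering the container's namespace and doing the configuration there), while only the returned closure is deferred and runs when configEndpointIpAddressAndRoute returns. A tiny standalone example of the pattern:
package main

import "fmt"

// enter stands in for enterContainerNetns: it does its setup work immediately
// and returns a closure that undoes it.
func enter() func() {
	fmt.Println("enter namespace (runs at the defer statement)")
	return func() {
		fmt.Println("leave namespace (runs when the surrounding function returns)")
	}
}

func configure() {
	// the trailing () defers the returned closure; without it only enter itself
	// would be deferred and the restore closure would never run
	defer enter()()
	fmt.Println("configure interfaces inside the namespace")
}

func main() {
	configure()
}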
// move the container's network endpoint into the container's network namespace
// and lock the OS thread the current goroutine runs on, so that the thread enters the container's namespace.
// The return value is a function; calling it leaves the container's namespace and
// returns to the host's network namespace.
// This uses the github.com/vishvananda/netns library introduced earlier for namespace operations.
func enterContainerNetns(enLink *netlink.Link, cinfo *container.ContainerInfo, ep *Endpoint) func() {
	log.Infof("enterContainerNetns: %s", cinfo.Pid)
	// locate the container's net namespace:
	// opening /proc/{pid}/ns/net gives a file descriptor that can be used to operate on the net namespace.
	// The PID in containerInfo is the container's process ID as seen on the host,
	// so its /proc/{pid}/ns/net is the container's net namespace.
	f, err := os.OpenFile(fmt.Sprintf("/proc/%s/ns/net", cinfo.Pid), os.O_RDONLY, 0)
	if err != nil {
		log.Errorf("error get container net namespace, %v", err)
	}
	// take the file descriptor of the opened file
	nsFD := f.Fd()
	// lock the OS thread the current goroutine is running on.
	// Without the lock, the Go scheduler may move the goroutine to another thread,
	// and we could no longer guarantee staying in the intended network namespace,
	// so runtime.LockOSThread must be called first.
	runtime.LockOSThread()
	// move the other end of the veth pair into the container's namespace
	if err = netlink.LinkSetNsFd(*enLink, int(nsFD)); err != nil {
		log.Errorf("error set link netns, %v", err)
	}
	// save the current (host) network namespace
	originNet, err := netns.Get()
	if err != nil {
		log.Errorf("get current netns err: %v", err)
	}
	// switch the current thread to the new network namespace;
	// the previous namespace is restored when the returned function is called
	if err = netns.Set(netns.NsHandle(nsFD)); err != nil {
		log.Errorf("error set netns, %v", err)
	}
	// build the container's IP address plus subnet, used to configure the interface inside the container.
	// For example, if the container IP is 192.168.1.2 and the network's subnet is 192.168.1.0/24,
	// the resulting string is 192.168.1.2/24, which is used to configure the container-side veth.
	interfaceIp := *ep.Network.IpRange
	interfaceIp.IP = ep.IpAddress
	log.Infof("container ip and subnet: %s, %s", interfaceIp.String(), interfaceIp.IP.String())
	// bring up the veth endpoint inside the container
	if err = setInterfaceUp(ep.Device.PeerName); err != nil {
		//return fmt.Errorf("setInterfaceUp ip %s, err: %w", ep.Device.PeerName, err)
		log.Errorf("setInterfaceUp ip %s, err: %v", ep.Device.PeerName, err)
	}
	// set the IP of the veth endpoint inside the container
	if err = setInterfaceIp(ep.Device.PeerName, interfaceIp.String()); err != nil {
		//return fmt.Errorf("setinterface ip %v, err: %w", ep.Network, err)
		log.Errorf("setinterface ip %v, err: %v", ep.Network, err)
	}
	// in a new net namespace the loopback interface lo (127.0.0.1) is down by default;
	// bring it up so the container can reach itself
	if err = setInterfaceUp("lo"); err != nil {
		//return fmt.Errorf("setInterfaceUp ip lo, err: %w", err)
		log.Errorf("setInterfaceUp ip lo, err: %v", err)
	}
	// route all external traffic from the container through its veth endpoint;
	// the 0.0.0.0/0 subnet matches every IP address
	_, cidr, _ := net.ParseCIDR("0.0.0.0/0")
	// build the route to add: the network device, the gateway IP and the destination subnet,
	// equivalent to: route add -net 0.0.0.0/0 gw {bridge address} dev {container-side veth}
	defaultRoute := &netlink.Route{
		LinkIndex: (*enLink).Attrs().Index,
		Gw:        ep.Network.GatewayIP,
		Dst:       cidr,
	}
	log.Infof("default route: %s", defaultRoute.String())
	// netlink.RouteAdd adds the route inside the container's network namespace,
	// equivalent to the route add command
	if err = netlink.RouteAdd(defaultRoute); err != nil {
		//return fmt.Errorf("add route err: %w", err)
		log.Errorf("add route err: %v", err)
	}
	// return to the previous net namespace:
	// once the configuration inside the container's namespace is done,
	// calling this function restores the program to the original net namespace
	return func() {
		// restore the previous net namespace saved above
		_ = netns.Set(originNet)
		// close the namespace handle
		_ = originNet.Close()
		// unlock the OS thread
		runtime.UnlockOSThread()
		// close the namespace file
		_ = f.Close()
		log.Infof("left container network namespace")
	}
}
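setInterfaceUp and setInterfaceIp used above are the helpers written in the previous article. For completeness, here is a minimal sketch of what they can look like with the netlink library; the actual implementations in the repository also log the interface name and index and may differ slightly.
// bring an interface up, equivalent to: ip link set <name> up
func setInterfaceUp(interfaceName string) error {
	iface, err := netlink.LinkByName(interfaceName)
	if err != nil {
		return fmt.Errorf("get link %s err: %w", interfaceName, err)
	}
	return netlink.LinkSetUp(iface)
}

// assign an address like 162.16.0.4/24 to an interface, equivalent to: ip addr add <rawIp> dev <name>
func setInterfaceIp(interfaceName, rawIp string) error {
	iface, err := netlink.LinkByName(interfaceName)
	if err != nil {
		return fmt.Errorf("get link %s err: %w", interfaceName, err)
	}
	ipNet, err := netlink.ParseIPNet(rawIp)
	if err != nil {
		return err
	}
	addr := &netlink.Addr{IPNet: ipNet}
	return netlink.AddrAdd(iface, addr)
}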
Testing
First start one container and check that everything works.
# create the network
./main network create --driver bridge --subnet 192.168.10.1/24 testbridge
# start the container
$ ./main run -ti -net testbridge -name test1 sh
{"level":"info","msg":"memory cgroup path: /sys/fs/cgroup/memory/mydocker-cgroup","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"memory cgroup path: /sys/fs/cgroup/memory/mydocker-cgroup","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"load ipam file from: /var/run/mydocker/network/ipam/subnet.json","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"allocate subnet: 162.16.0.4/24, ip: 162.16.0.4","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"dump ipam file from: /var/run/mydocker/network/ipam/","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"分配的容器ip地址为: 162.16.0.4","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"init come on","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"current location: /home/lw/code/go/dockerDemo/mnt","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"peerLink index: 10 -- cif-21765","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"enterContainerNetns: 7789","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"容器的ip地址和网段:162.16.0.4/24, 162.16.0.4","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"ip link set up: cif-21765 -- 10","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"add addr, iface: 10 -- cif-21765, ip: 162.16.0.4/24","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"ip link set up: lo -- 1","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"default route: {Ifindex: 10 Dst: 0.0.0.0/0 Src: \u003cnil\u003e Gw: 162.16.0.1 Flags: [] Table: 0}","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"all command is : sh","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"parent process run","time":"2022-04-24T05:25:03+08:00"}
{"level":"info","msg":"find path: /bin/sh","time":"2022-04-24T05:25:03+08:00"}
# lo and the veth are both configured correctly
/ # ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
10: cif-21765@if11: <BROADCAST,MULTICAST,UP,LOWER_UP,M-DOWN> mtu 1500 qdisc noqueue qlen 1000
link/ether 5e:a5:ca:22:8e:53 brd ff:ff:ff:ff:ff:ff
inet 162.16.0.4/24 brd 162.16.0.255 scope global cif-21765
valid_lft forever preferred_lft forever
inet6 fe80::5ca5:caff:fe22:8e53/64 scope link
valid_lft forever preferred_lft forever
# the routes are set up correctly as well
/ # ip r
default via 162.16.0.1 dev cif-21765
162.16.0.0/24 dev cif-21765 scope link src 162.16.0.4
# pinging the gateway works
/ # ping 162.16.0.1
PING 162.16.0.1 (162.16.0.1): 56 data bytes
64 bytes from 162.16.0.1: seq=0 ttl=64 time=0.136 ms
64 bytes from 162.16.0.1: seq=1 ttl=64 time=0.146 ms
^C
--- 162.16.0.1 ping statistics ---
2 packets transmitted, 2 packets received, 0% packet loss
round-trip min/avg/max = 0.136/0.141/0.146 ms
# pinging the host works
/ # ping 192.168.1.5
PING 192.168.1.5 (192.168.1.5): 56 data bytes
64 bytes from 192.168.1.5: seq=0 ttl=64 time=0.206 ms
64 bytes from 192.168.1.5: seq=1 ttl=64 time=0.163 ms
64 bytes from 192.168.1.5: seq=2 ttl=64 time=0.148 ms
^C
--- 192.168.1.5 ping statistics ---
3 packets transmitted, 3 packets received, 0% packet loss
round-trip min/avg/max = 0.148/0.172/0.206 ms
# pinging an external address works
/ # ping 114.114.114.114
PING 114.114.114.114 (114.114.114.114): 56 data bytes
64 bytes from 114.114.114.114: seq=0 ttl=67 time=35.061 ms
64 bytes from 114.114.114.114: seq=1 ttl=70 time=34.040 ms
^C
--- 114.114.114.114 ping statistics ---
3 packets transmitted, 2 packets received, 33% packet loss
round-trip min/avg/max = 34.040/34.550/35.061 ms
Start a Second Container with Port Mapping
# start the second container
$ ./main run -ti -net testbridge -name test2 -p 90:90 sh
{"level":"info","msg":"memory cgroup path: /sys/fs/cgroup/memory/mydocker-cgroup","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"memory cgroup path: /sys/fs/cgroup/memory/mydocker-cgroup","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"load ipam file from: /var/run/mydocker/network/ipam/subnet.json","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"allocate subnet: 162.16.0.8/24, ip: 162.16.0.8","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"dump ipam file from: /var/run/mydocker/network/ipam/","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"分配的容器ip地址为: 162.16.0.8","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"init come on","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"current location: /home/lw/code/go/dockerDemo/mnt","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"端口映射:162.16.0.8, [90:90]","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"-t nat -A PREROUTING -p tcp -m tcp --dport 90 -j DNAT --to-destination 162.16.0.8:90","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"peerLink index: 18 -- cif-76526","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"enterContainerNetns: 9443","time":"2022-04-24T05:37:19+08:00"}
{"level":"info","msg":"容器的ip地址和网段:162.16.0.8/24, 162.16.0.8","time":"2022-04-24T05:37:20+08:00"}
{"level":"info","msg":"ip link set up: cif-76526 -- 18","time":"2022-04-24T05:37:20+08:00"}
{"level":"info","msg":"add addr, iface: 18 -- cif-76526, ip: 162.16.0.8/24","time":"2022-04-24T05:37:20+08:00"}
{"level":"info","msg":"ip link set up: lo -- 1","time":"2022-04-24T05:37:20+08:00"}
{"level":"info","msg":"default route: {Ifindex: 18 Dst: 0.0.0.0/0 Src: \u003cnil\u003e Gw: 162.16.0.1 Flags: [] Table: 0}","time":"2022-04-24T05:37:20+08:00"}
{"level":"info","msg":"all command is : sh","time":"2022-04-24T05:37:20+08:00"}
{"level":"info","msg":"parent process run","time":"2022-04-24T05:37:20+08:00"}
{"level":"info","msg":"find path: /bin/sh","time":"2022-04-24T05:37:20+08:00"}
# listen on port 90 inside the container; connecting from another machine with telnet works
/ # nc -lp 90
fdsj
^Cpunt!
# pinging our first container works
/ # ping 162.16.0.4
PING 162.16.0.4 (162.16.0.4): 56 data bytes
64 bytes from 162.16.0.4: seq=0 ttl=64 time=0.119 ms
64 bytes from 162.16.0.4: seq=1 ttl=64 time=0.150 ms
64 bytes from 162.16.0.4: seq=2 ttl=64 time=0.148 ms
64 bytes from 162.16.0.4: seq=3 ttl=64 time=0.147 ms
64 bytes from 162.16.0.4: seq=4 ttl=64 time=0.149 ms
^C
--- 162.16.0.4 ping statistics ---
5 packets transmitted, 5 packets received, 0% packet loss
round-trip min/avg/max = 0.119/0.142/0.150 ms
# pinging an external address works
/ # ping 114.114.114.114
PING 114.114.114.114 (114.114.114.114): 56 data bytes
64 bytes from 114.114.114.114: seq=0 ttl=75 time=34.561 ms
64 bytes from 114.114.114.114: seq=1 ttl=69 time=33.634 ms
^C
--- 114.114.114.114 ping statistics ---
2 packets transmitted, 2 packets received, 0% packet loss
round-trip min/avg/max = 33.634/34.097/34.561 ms
Back on the host, the network interfaces and iptables rules below match what we expect.
# the rules we added are present
$ iptables -t nat -nL --line-numbers
Chain PREROUTING (policy ACCEPT)
num target prot opt source destination
1 DOCKER all -- 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type LOCAL
2 DNAT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:90 to:162.16.0.8:90
Chain INPUT (policy ACCEPT)
num target prot opt source destination
Chain OUTPUT (policy ACCEPT)
num target prot opt source destination
1 DOCKER all -- 0.0.0.0/0 !127.0.0.0/8 ADDRTYPE match dst-type LOCAL
Chain POSTROUTING (policy ACCEPT)
num target prot opt source destination
1 MASQUERADE all -- 172.17.0.0/16 0.0.0.0/0
2 MASQUERADE all -- 162.16.0.0/24 0.0.0.0/0
Chain DOCKER (2 references)
num target prot opt source destination
1 RETURN all -- 0.0.0.0/0 0.0.0.0/0
# ip shows the host-side veth devices of the two containers we started
$ ip a
5: testbridge: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 42:2a:4b:44:dd:0c brd ff:ff:ff:ff:ff:ff
inet 162.16.0.1/24 brd 162.16.0.255 scope global testbridge
valid_lft forever preferred_lft forever
inet6 fe80::acda:9eff:fe9d:e125/64 scope link
valid_lft forever preferred_lft forever
11: 21765@if10: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master testbridge state UP group default qlen 1000
link/ether 42:2a:4b:44:dd:0c brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet6 fe80::402a:4bff:fe44:dd0c/64 scope link
valid_lft forever preferred_lft forever
19: 76526@if18: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master testbridge state UP group default qlen 1000
link/ether e6:d6:ef:8e:90:7c brd ff:ff:ff:ff:ff:ff link-netnsid 1
inet6 fe80::e4d6:efff:fe8e:907c/64 scope link
valid_lft forever preferred_lft forever
Summary
The networking part is finally more or less done; it held me up for almost two weeks and took a lot of books and other material.
This article combined the earlier manual-configuration knowledge to configure the container network automatically.
Copyright notice: this article is an original piece by InfoQ author 萧 (Xiao).
Original link: http://xie.infoq.cn/article/a8ca1ec12e4c267a0b08b2d4b
This article is published under the CC-BY 4.0 license; please keep the original link and this notice when reposting.