Golang服務器熱重啓、熱升級、熱更新(safe and graceful hot-restart/reload http server)詳解

服務端代碼常常需要升級,對於線上系統的升級,常用的做法是通過前端的負載均衡(如nginx)來保證升級時至少有一個服務可用,依次(灰度)升級。

而另外一種更方便的方法是在應用上做熱重啓,直接更新源碼、配置或升級應用而不停服務。

這個功能在重要業務上尤爲重要,因爲它直接影響服務可用性和用戶體驗。

原理

熱重啓的原理比較簡單,但是涉及到一些系統調用以及父子進程之間文件句柄的傳遞等等,細節比較多。
處理過程分爲以下幾個步驟:

  1. 監聽信號(USR2..)
  2. 收到信號時fork子進程(使用相同的啓動命令),將服務監聽的socket文件描述符傳遞給子進程
  3. 子進程監聽父進程的socket,這個時候父進程和子進程都可以接收請求
  4. 子進程啓動成功之後,父進程停止接收新的連接,等待舊連接處理完成(或超時)
  5. 父進程退出,重啓完成

細節

  • 父進程將socket文件描述符傳遞給子進程,可以通過命令行或者環境變量等方式
  • 子進程啓動時使用和父進程一樣的命令行;對於golang來說,用更新的可執行程序覆蓋舊程序即可
  • server.Shutdown()優雅關閉方法是go>=1.8的新特性
  • server.Serve(l)方法在Shutdown時立即返回,Shutdown方法則阻塞至context完成,所以Shutdown的方法要寫在主goroutine中

代碼

package main

import (
    "context"
    "errors"
    "flag"
    "log"
    "net"
    "net/http"
    "os"
    "os/exec"
    "os/signal"
    "syscall"
    "time"
)

// server is the HTTP server created in main and shut down by the signal
// handler.
var server *http.Server

// listener is the socket the server accepts on: either freshly opened or
// rebuilt from the file descriptor inherited from the parent process.
var listener net.Listener

// graceful is set by the parent when it re-executes the binary; it tells the
// child to adopt file descriptor 3 instead of opening a new socket.
var graceful = flag.Bool("graceful", false, "listen on fd open 3 (internal use only)")

// handler simulates a slow endpoint: it holds the request for 20 seconds and
// then writes a fixed body, so that a restart can be triggered while the
// request is still in flight.
func handler(w http.ResponseWriter, r *http.Request) {
	const (
		delay = 20 * time.Second
		body  = "hello world233333!!!!"
	)

	time.Sleep(delay)
	w.Write([]byte(body))
}

func main() {
    flag.Parse()

    http.HandleFunc("/hello", handler)
    server = &http.Server{Addr: ":9999"}

    var err error
    if *graceful {
        log.Print("main: Listening to existing file descriptor 3.")
        // cmd.ExtraFiles: If non-nil, entry i becomes file descriptor 3+i.
        // when we put socket FD at the first entry, it will always be 3(0+3)
     //爲何是3呢,而不是1 0 或者其餘數字?這是由於父進程裏給了個fd給子進程了 而子進程裏0,1,2是預留給 標準輸入、輸出和錯誤的,因此父進程給的第一個fd在子進程裏順序排就是從3開始了;若是fork的時候cmd.ExtraFiles給了兩個文件句柄,那麼子進程裏還能夠用4開始,就看你開了幾個子進程自增就行。由於我這裏就開一個子進程因此把3寫死了。這一步只是把 fd描述符包裝進這個結構體。
f := os.NewFile(3, "")
     //先複製fd到新的fd, 而後設置子進程exec時自動關閉父進程的fd,即「F_DUPFD_CLOEXEC」 listener, err =
net.FileListener(f) } else { log.Print("main: Listening on a new file descriptor.") listener, err = net.Listen("tcp", server.Addr) } if err != nil { log.Fatalf("listener error: %v", err) } go func() { // server.Shutdown() stops Serve() immediately, thus server.Serve() should not be in main goroutine err = server.Serve(listener) log.Printf("server.Serve err: %v\n", err) }() signalHandler() log.Printf("signal end") } func reload() error { tl, ok := listener.(*net.TCPListener) if !ok { return errors.New("listener is not tcp listener") } f, err := tl.File() if err != nil { return err } args := []string{"-graceful"} cmd := exec.Command(os.Args[0], args...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr // put socket FD at the first entry cmd.ExtraFiles = []*os.File{f} return cmd.Start() } func signalHandler() { ch := make(chan os.Signal, 1) signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM, syscall.SIGUSR2) for { sig := <-ch log.Printf("signal: %v", sig) // timeout context for shutdown ctx, _ := context.WithTimeout(context.Background(), 20*time.Second) switch sig { case syscall.SIGINT, syscall.SIGTERM: // stop log.Printf("stop") signal.Stop(ch) server.Shutdown(ctx) log.Printf("graceful shutdown") return case syscall.SIGUSR2: // reload log.Printf("reload") err := reload() if err != nil { log.Fatalf("graceful restart error: %v", err) } server.Shutdown(ctx) log.Printf("graceful reload") return } } }l, err = net.FileListener(f)TCPListener

 

我的實現

 
  
package main

import (
"net"
"net/http"
"time"
"log"
"syscall"
"os"
"os/signal"
"context"
"fmt"
"os/exec"
"flag"
)
// listener is the socket the server accepts on; main assigns it on start-up.
var listener net.Listener

// err holds the result of creating the listener in main.
var err error

// server is declared at package level, but main constructs and serves its own
// local http.Server value, so this one is not the instance handling requests.
var server http.Server

// graceful is set by the parent when it re-executes the binary with -g; it
// tells the child to adopt file descriptor 3 instead of opening a new socket.
var graceful = flag.Bool("g", false, "listen on fd open 3 (internal use only)")

// MyHandler is a stateless HTTP handler used to show that slow, in-flight
// requests are still answered while the process is being replaced.
type MyHandler struct{}

// ServeHTTP logs the start of the request, holds it for ten seconds, writes a
// fixed body and then logs completion together with the serving process ID.
//
// Note that the start line also carries a "request done at" label; its
// timestamp is taken before the sleep, not when the request finishes.
func (*MyHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	begin := time.Now()
	target := r.URL.Path + "?" + r.URL.RawQuery
	stamp := time.Now()
	fmt.Println("request start at ", begin, target, "request done at ", stamp, " pid:", os.Getpid())

	time.Sleep(10 * time.Second)
	w.Write([]byte("this is test response"))

	fmt.Println("request done at ", time.Now(), " pid:", os.Getpid())
}

// main runs the hot-restart demo server.
//
// On first boot it listens on :1111; when started with -g it adopts the
// listening socket inherited as file descriptor 3. Requests are served in a
// background goroutine while main blocks waiting for SIGHUP or SIGTERM.
// Either signal starts a replacement process (same binary, "-g") that
// inherits the listener; this process then stops accepting, waits up to 20
// seconds for in-flight requests and returns so that the old process exits.
func main() {
	flag.Parse()
	fmt.Println("start-up at ", time.Now(), *graceful)

	if *graceful {
		// The parent passes the socket as the first cmd.ExtraFiles entry,
		// which the child sees as fd 3 (0-2 are stdin, stdout and stderr).
		f := os.NewFile(3, "")
		listener, err = net.FileListener(f)
		if err != nil {
			log.Println(err)
			return
		}
		fmt.Printf("graceful-reborn %v %v %#v \n", f.Fd(), f.Name(), listener)
	} else {
		listener, err = net.Listen("tcp", ":1111")
		// Check before touching the listener: on failure it is nil and the
		// diagnostic below would dereference a nil *os.File.
		if err != nil {
			log.Println(err)
			return
		}
		// Diagnostic only: show the descriptor behind the new listener.
		if tcp, ok := listener.(*net.TCPListener); ok {
			if fd, fileErr := tcp.File(); fileErr == nil {
				fmt.Printf("first-boot %v %v %#v \n ", fd.Fd(), fd.Name(), listener)
				// File() returns a duplicate that the caller must close.
				fd.Close()
			}
		}
	}

	srv := &http.Server{
		Handler:     &MyHandler{},
		ReadTimeout: 6 * time.Second,
	}
	log.Printf("Actual pid is %d\n", syscall.Getpid())
	log.Printf(" listener: %v\n", listener)

	go func() { // do not block the main goroutine
		if serveErr := srv.Serve(listener); serveErr != nil {
			log.Println(serveErr)
		}
	}()

	// signals
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGHUP, syscall.SIGTERM)
	for { // block the main goroutine, waiting for a signal
		sig := <-sigCh
		log.Printf("signal: %v", sig)

		switch sig {
		case syscall.SIGTERM, syscall.SIGHUP:
			println("signal cause reloading")
			signal.Stop(sigCh)

			// fork new child process
			tl, ok := listener.(*net.TCPListener)
			if !ok {
				fmt.Println("listener is not tcp listener")
				return
			}
			currentFD, fileErr := tl.File()
			if fileErr != nil {
				fmt.Println("acquiring listener file failed")
				return
			}
			cmd := exec.Command(os.Args[0], "-g")
			cmd.ExtraFiles = []*os.File{currentFD}
			cmd.Stdout = os.Stdout
			cmd.Stderr = os.Stderr
			startErr := cmd.Start()
			// The child holds its own copy once Start returns; release ours
			// whether or not the start succeeded.
			currentFD.Close()
			if startErr != nil {
				fmt.Println("cmd.Start fail: ", startErr)
				return
			}
			fmt.Println("forked new pid : ", cmd.Process.Pid)

			ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
			shutdownErr := srv.Shutdown(ctx)
			cancel()
			if shutdownErr != nil {
				log.Println(shutdownErr)
			}
			fmt.Println("graceful shutdown at ", time.Now())
			// signal.Stop was called above, so nothing will ever arrive on
			// sigCh again; return here so the old process actually exits.
			return
		}
	}
}
 
qiangjian@sun-pro:/data1/works/IdeaProjects/go_core$ go run src/wright/hotrestart/booter.go  
start-up at  2018-10-12 15:29:34.586269 +0800 CST m=+0.004439497 false
first-boot  5 tcp:[::]:1111-> &net.TCPListener{fd:(*net.netFD)(0xc00010e000)} 
 2018/10/12 15:29:34 Actual pid is 10771
2018/10/12 15:29:34  listener: &{0xc00010e000}
request start at  2018-10-12 15:29:40.287928 +0800 CST m=+5.705965906 /aa/bb?c=d request done at  2018-10-12 15:29:40.287929 +0800 CST m=+5.705966554   pid: 10771
2018/10/12 15:29:49 signal: terminated
signal cause reloading
forked new pid :  10775
start-up at  2018-10-12 15:29:49.689064 +0800 CST m=+0.001613279 true
graceful-reborn  3   &net.TCPListener{fd:(*net.netFD)(0xc0000ec000)} 
2018/10/12 15:29:49 Actual pid is 10775
2018/10/12 15:29:49  listener: &{0xc0000ec000}
request done at  2018-10-12 15:29:50.288525 +0800 CST m=+15.706330718   pid: 10771
2018/10/12 15:29:50 http: Server closed
request start at  2018-10-12 15:29:50.290622 +0800 CST m=+15.708426906 /aa/bb?c=d request done at  2018-10-12 15:29:50.290623 +0800 CST m=+15.708428113   pid: 10771
request start at  2018-10-12 15:29:50.290713 +0800 CST m=+0.603248262 /aa/bb?c=d request done at  2018-10-12 15:29:50.290714 +0800 CST m=+0.603249293   pid: 10775
request done at  2018-10-12 15:30:00.293988 +0800 CST m=+10.606290169   pid: 10775
request done at  2018-10-12 15:30:00.294043 +0800 CST m=+25.711615717   pid: 10771
request start at  2018-10-12 15:30:00.295554 +0800 CST m=+10.607856283 /aa/bb?c=d request done at  2018-10-12 15:30:00.295555 +0800 CST m=+10.607857307   pid: 10775
request start at  2018-10-12 15:30:00.29558 +0800 CST m=+10.607881997 /aa/bb?c=d request done at  2018-10-12 15:30:00.295581 +0800 CST m=+10.607883004   pid: 10775
graceful shutdown at  2018-10-12 15:30:00.79544 +0800 CST m=+26.213000502
ab -v -k -c2 -n100 '127.0.0.1:1111/aa/bb?c=d'
This is ApacheBench, Version 2.3 <$Revision: 1826891 $>
Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/
Licensed to The Apache Software Foundation, http://www.apache.org/

Benchmarking 127.0.0.1 (be patient)...^C

Server Software:        
Server Hostname:        127.0.0.1
Server Port:            1111

Document Path:          /aa/bb?c=d
Document Length:        21 bytes

Concurrency Level:      2
Time taken for tests:   48.292 seconds
Complete requests:      7
Failed requests:        0
Total transferred:      966 bytes
HTML transferred:       147 bytes
Requests per second:    0.14 [#/sec] (mean)
Time per request:       13797.702 [ms] (mean)
Time per request:       6898.851 [ms] (mean, across all concurrent requests)
Transfer rate:          0.02 [Kbytes/sec] received
kill 進程ID  #發送TERM信號

 

// Another way to fork; essentially the same as the approach above.
// syscall.ForkExec is called directly with an explicit descriptor table.
execSpec := &syscall.ProcAttr{
    // The child receives a copy of the current environment.
    Env:   os.Environ(),
    // Position i in Files becomes descriptor i in the child: 0-2 are the
    // standard streams and lFd lands on fd 3.
    // lFd is not defined in this snippet — presumably the listening socket's
    // descriptor; confirm against the surrounding code.
    Files: []uintptr{os.Stdin.Fd(), os.Stdout.Fd(), os.Stderr.Fd(), lFd},
}
// NOTE(review): os.Args is passed through unchanged, so no "-g" flag is added
// here — confirm how the child is told to adopt fd 3.
pid, err := syscall.ForkExec(os.Args[0], os.Args, execSpec)

 

 

 

可以看出: ab測試器Failed爲0,且console中顯示老請求處理完後才shutdown,即在kill觸發reload後,無論是老進程的舊請求,還是fork子進程後的新請求,全都處理成功,沒有失敗的。這就是我們說的熱重啓!

systemd & supervisor

父進程退出之後,子進程會掛到1號進程上面。這種情況下使用systemd和supervisord等管理程序會顯示進程處於failed的狀態。解決這個問題有兩個方法:

  • 使用pidfile,每次進程重啓更新一下pidfile,讓進程管理者通過這個文件感知到main pid的變動。
  • 更通用的做法:起一個master來管理服務進程,每次熱重啓master拉起一個新的進程,把舊的kill掉。這時master的pid沒有變化,對於進程管理者來說進程處於正常的狀態。一個簡潔的實現

FD複製時細節

請看:

https://blog.csdn.net/ChrisNiu1984/article/details/7050663

http://man7.org/linux/man-pages/man2/fcntl.2.html#F_DUPFD_CLOEXEC

 

References

相關文章
相關標籤/搜索