Skip to content

Commit

Permalink
Merge pull request #320 from mesosphere/optional-zk-timeout
Browse files Browse the repository at this point in the history
Implement optional ZK detection timeout with resets on watch loss
  • Loading branch information
tsenart committed Oct 9, 2015
2 parents 248895a + 63b7331 commit c662855
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 57 deletions.
18 changes: 0 additions & 18 deletions detect/masters.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,6 @@ func NewMasters(masters []string, changed chan<- []string) *Masters {
// It implements the detector.MasterChanged interface.
func (ms *Masters) OnMasterChanged(leader *mesos.MasterInfo) {
	logging.VeryVerbose.Println("Updated leader: ", leader)

	if leader != nil {
		// Resolve the new leader's address first, then rebuild the list from
		// it plus every entry after index 0 of the current list.
		// NOTE(review): index 0 presumably holds the previous leader; confirm
		// against NewMasters and ordered.
		addr := masterAddr(leader)
		ms.masters = ordered(addr, ms.masters[1:])

		// Publish the rebuilt list on the change channel.
		emit(ms.changed, ms.masters)
		return
	}

	// A nil leader is only logged: the stored list is left untouched and
	// nothing is emitted.
	logging.Error.Println("No master available in Zookeeper.")
}
Expand All @@ -60,24 +54,12 @@ func (ms *Masters) OnMasterChanged(leader *mesos.MasterInfo) {
// It implements the detector.AllMasters interface.
func (ms *Masters) UpdatedMasters(infos []*mesos.MasterInfo) {
	logging.VeryVerbose.Println("Updated masters: ", infos)

	// A nil slice is reported and ignored; nothing is stored or emitted.
	if infos == nil {
		logging.Error.Println("No masters available in Zookeeper.")
		return
	}

	// Collect the address of every info, dropping those for which
	// masterAddr yields an empty string.
	addrs := make([]string, 0, len(infos))
	for i := range infos {
		addr := masterAddr(infos[i])
		if addr == "" {
			continue
		}
		addrs = append(addrs, addr)
	}

	// With no usable address left there is nothing to publish.
	if len(addrs) < 1 {
		logging.Error.Println("No valid masters available in Zookeeper.")
		return
	}

	// Keep the current first entry and rebuild the list around it.
	// NOTE(review): index 0 presumably holds the leader; confirm against
	// NewMasters and ordered.
	head := ms.masters[0]
	ms.masters = ordered(head, addrs)
	emit(ms.changed, ms.masters)
}
Expand Down
16 changes: 8 additions & 8 deletions detect/masters_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,21 @@ func TestMasters_UpdatedMasters(t *testing.T) {
},
{
// update additional masters with an empty slice
// expect no update at all (nil)
// expect empty masters
masterInfos(),
nil,
[]string{""},
},
{
// update masters with a niladic value
// expect no update at all (nil)
nil,
// expect empty masters
nil,
[]string{""},
},
} {
m.UpdatedMasters(tt.masters)

if got := recv(ch); !reflect.DeepEqual(got, tt.want) {
t.Errorf("test #%d: got %v, want: %v", i, got, tt.want)
t.Errorf("test #%d: got %#v, want: %#v", i, got, tt.want)
}
}
}
Expand Down Expand Up @@ -93,15 +93,15 @@ func TestMasters_OnMasterChanged(t *testing.T) {
},
{
// update new leader with a niladic value
// expect no update at all (nil)
nil,
// expect empty leader
nil,
[]string{""},
},
} {
m.OnMasterChanged(tt.leader)

if got := recv(ch); !reflect.DeepEqual(got, tt.want) {
t.Errorf("test #%d: got %v, want: %v", i, got, tt.want)
t.Errorf("test #%d: got %#v, want: %#v", i, got, tt.want)
}
}
}
Expand Down
12 changes: 12 additions & 0 deletions docs/docs/configuration-parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ The configuration file should include the following fields:

`zk` is a link to the Zookeeper instances on the Mesos cluster. Its format is `zk://host1:port1,host2:port2/mesos/`, where the number of hosts can be one or more. The default port for Zookeeper is `2181`. Mesos-DNS will monitor the Zookeeper instances to detect the current leading master.

`zkDetectionTimeout` defines how long to wait (in seconds) for Zookeeper to report a new leading Mesos master.
This timeout is activated on:

- Start up, where it plays the role of the "initial leader detection timeout" via ZK.
- Mesos cluster changes, where there is no leading master for some period of time.
- Zookeeper or network failure, when losing connection to the ZK cluster.

If a *non-zero* timeout is specified and the timeout threshold is exceeded before
a new leading Mesos master is reported by the ZK-based master detector, the program will exit.

Defaults to `30` seconds. A value of `0` disables the timeout.

`masters` is a comma separated list with the IP address and port number for the master(s) in the Mesos cluster. Mesos-DNS will automatically find the leading master at any point in order to retrieve state about running tasks. If there is no leading master or the leading master is not responsive, Mesos-DNS will continue serving DNS requests based on stale information about running tasks. The `masters` field is required.

It is sufficient to specify just one of the `zk` or `masters` fields. If both are defined, Mesos-DNS will first attempt to detect the leading master through Zookeeper. If Zookeeper is not responding, it will fall back to using the `masters` field. Both the `zk` and `masters` fields are static. To update them you need to restart Mesos-DNS. We recommend you use the `zk` field since this allows the dynamic addition of Mesos masters.
Expand Down
39 changes: 27 additions & 12 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,19 +52,14 @@ func main() {
go func() { errch <- <-res.LaunchHTTP() }()
}

changed := make(chan []string, 1)
if config.Zk != "" {
logging.Verbose.Println("Starting master detector for ZK ", config.Zk)
if md, err := detector.New(config.Zk); err != nil {
log.Fatalf("failed to create master detector: %v", err)
} else if err := md.Detect(detect.NewMasters(config.Masters, changed)); err != nil {
log.Fatalf("failed to initialize master detector: %v", err)
}
} else {
changed <- config.Masters
}

changed := detectMasters(config.Zk, config.Masters)
reload := time.NewTicker(time.Second * time.Duration(config.RefreshSeconds))
zkTimeout := time.Second * time.Duration(config.ZkDetectionTimeout)
timeout := time.AfterFunc(zkTimeout, func() {
if zkTimeout > 0 {
errch <- fmt.Errorf("master detection timed out after %s", zkTimeout)
}
})

defer reload.Stop()
defer util.HandleCrash()
Expand All @@ -73,6 +68,11 @@ func main() {
case <-reload.C:
res.Reload()
case masters := <-changed:
if len(masters) == 0 || masters[0] == "" { // no leader
timeout.Reset(zkTimeout)
} else {
timeout.Stop()
}
logging.VeryVerbose.Printf("new masters detected: %v", masters)
res.SetMasters(masters)
res.Reload()
Expand All @@ -81,3 +81,18 @@ func main() {
}
}
}

// detectMasters returns a channel, buffered to one element, on which master
// lists are delivered.
//
// When zk is empty, the given masters are queued on the channel once and no
// detector is started. Otherwise a master detector is created for zk and
// started with a detect.Masters listener built from masters and the channel.
// A failure to create or start the detector terminates the program through
// log.Fatalf.
func detectMasters(zk string, masters []string) <-chan []string {
	updates := make(chan []string, 1)

	// Static configuration: hand over the configured masters and finish.
	if zk == "" {
		updates <- masters
		return updates
	}

	logging.Verbose.Println("Starting master detector for ZK ", zk)

	md, err := detector.New(zk)
	if err != nil {
		log.Fatalf("failed to create master detector: %v", err)
	}

	listener := detect.NewMasters(masters, updates)
	if err := md.Detect(listener); err != nil {
		log.Fatalf("failed to initialize master detector: %v", err)
	}

	return updates
}
43 changes: 24 additions & 19 deletions records/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ type Config struct {
// Timeout is the default connect/read/write timeout for outbound
// queries
Timeout int
// ZkDetectionTimeout is how long, in seconds, to wait for Zookeeper to
// report a leading Mesos master, both at start up and whenever the leader
// is lost. Default is 30 and 0 means no timeout.
ZkDetectionTimeout int
// NOTE(tsenart): HTTPPort, DNSOn and HTTPOn have defined JSON keys for
// backwards compatibility with external API clients.
HTTPPort int `json:"HttpPort"`
Expand Down Expand Up @@ -64,25 +67,26 @@ type Config struct {
// NewConfig return the default config of the resolver
func NewConfig() Config {
return Config{
RefreshSeconds: 60,
TTL: 60,
Domain: "mesos",
Port: 53,
Timeout: 5,
SOARname: "root.ns1.mesos",
SOAMname: "ns1.mesos",
SOARefresh: 60,
SOARetry: 600,
SOAExpire: 86400,
SOAMinttl: 60,
Resolvers: []string{"8.8.8.8"},
Listener: "0.0.0.0",
HTTPPort: 8123,
DNSOn: true,
HTTPOn: true,
ExternalOn: true,
RecurseOn: true,
IPSources: []string{"netinfo", "mesos", "host"},
ZkDetectionTimeout: 30,
RefreshSeconds: 60,
TTL: 60,
Domain: "mesos",
Port: 53,
Timeout: 5,
SOARname: "root.ns1.mesos",
SOAMname: "ns1.mesos",
SOARefresh: 60,
SOARetry: 600,
SOAExpire: 86400,
SOAMinttl: 60,
Resolvers: []string{"8.8.8.8"},
Listener: "0.0.0.0",
HTTPPort: 8123,
DNSOn: true,
HTTPOn: true,
ExternalOn: true,
RecurseOn: true,
IPSources: []string{"netinfo", "mesos", "host"},
}
}

Expand Down Expand Up @@ -126,6 +130,7 @@ func SetConfig(cjson string) Config {
logging.Verbose.Println("Mesos-DNS configuration:")
logging.Verbose.Println(" - Masters: " + strings.Join(c.Masters, ", "))
logging.Verbose.Println(" - Zookeeper: ", c.Zk)
logging.Verbose.Println(" - ZookeeperDetectionTimeout: ", c.ZkDetectionTimeout)
logging.Verbose.Println(" - RefreshSeconds: ", c.RefreshSeconds)
logging.Verbose.Println(" - Domain: " + c.Domain)
logging.Verbose.Println(" - Listener: " + c.Listener)
Expand Down

0 comments on commit c662855

Please sign in to comment.