Solr Cloud 无法连接到(随机)Zookeeper 节点(完整 docker 设置)

Solr Cloud cannot connect to (random) Zookeeper node (full docker set-up)

简介

我正在尝试使用 Zookeeper 来尝试 SolrCloud。 我知道 SolrCloud 有自己的内置 Zookeeper,但由于不推荐使用该设置,我模仿(或者至少我希望如此)外部 Zookeeper 集合 - Solr Cloud 设置(3 个 ZK 节点,2 个 Solr节点)。

为此,我创建了以下内容 docker-compose.yml

version: '3.8'

services:
  zoo1:
    image: library/zookeeper:3.5.7
    container_name: zoo1
    restart: always
    hostname: zoo1
    ports:
      - 8184:8080
    environment:
      TZ: Europe/Paris
      ZOO_MY_ID: 1
      ZOO_SERVERS: server.1=0.0.0.0:2888:3888;2181 server.2=zoo2:2888:3888;2181 server.3=zoo3:2888:3888;2181
    networks:
      - solr
    command: >
      sh -c "ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && 
      echo $TZ > /etc/timezone &&
      sed -i 's/autopurge.purgeInterval=0/autopurge.purgeInterval=1/g' /conf/zoo.cfg  &&
      echo 4lw.commands.whitelist=mntr,conf,ruok >> /conf/zoo.cfg &&
      exec zkServer.sh start-foreground"

  zoo2:
    image: library/zookeeper:3.5.7
    container_name: zoo2
    restart: always
    hostname: zoo2
    ports:
      - 8284:8080
    environment:
      TZ: Europe/Paris
      ZOO_MY_ID: 2
      ZOO_SERVERS: server.1=zoo1:2888:3888;2181 server.2=0.0.0.0:2888:3888;2181 server.3=zoo3:2888:3888;2181
    networks:
      - solr
    command: >
      sh -c "ln -snf /usr/share/zoneinfo/$TZ /etc/localtime &&
      echo $TZ > /etc/timezone &&
      sed -i 's/autopurge.purgeInterval=0/autopurge.purgeInterval=1/g' /conf/zoo.cfg  &&
      echo 4lw.commands.whitelist=mntr,conf,ruok >> /conf/zoo.cfg &&
      exec zkServer.sh start-foreground"

  zoo3:
    image: library/zookeeper:3.5.7
    container_name: zoo3
    restart: always
    hostname: zoo3
    ports:
      - 8384:8080
    environment:
      TZ: Europe/Paris
      ZOO_MY_ID: 3
      ZOO_SERVERS: server.1=zoo1:2888:3888;2181 server.2=zoo2:2888:3888;2181 server.3=0.0.0.0:2888:3888;2181
    networks:
      - solr
    command: >
      sh -c "ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && 
      echo $TZ > /etc/timezone &&
      sed -i 's/autopurge.purgeInterval=0/autopurge.purgeInterval=1/g' /conf/zoo.cfg  &&
      echo 4lw.commands.whitelist=mntr,conf,ruok >> /conf/zoo.cfg &&
      exec zkServer.sh start-foreground"

  solr1:
    image: library/solr:8.6.3
    container_name: solr1
    ports:
      - "8981:8983"
    environment:
       ZK_HOST: zoo1:2181,zoo2:2181,zoo3:2181
    networks:
      - solr
    depends_on:
      - zoo1
      - zoo2
      - zoo3

  solr2:
    image: library/solr:8.6.3
    container_name: solr2
    ports:
      - "8982:8983"
    environment:
      ZK_HOST: zoo1:2181,zoo2:2181,zoo3:2181
    networks:
      - solr
    depends_on:
      - zoo1
      - zoo2
      - zoo3

networks:
  solr:
    name: solr_zookeeper_cluster

所以,使用这个文件,一切都开始起来很容易。 我其实有3个ZK节点,其中一个是leader,还有2个Solr节点...

问题

但是(这是我的实际问题)Solr UI 在显示 ZK 状态时表现得有点奇怪。 我总是在 zkStatus 中恰好有 2 个 ZK 实例没有问题,但恰好有一个“不正常”... 大多数时候,两个 Solr 节点都有同一个 Zookeeper 节点的问题,但是一旦我开始玩弄(如:停止领导者以触发领导者选举并重新启动该特定节点),它就会变得非常随机...... .

初始启动后的屏幕截图:

触发leader选举后的截图

部分节点日志

2020-10-14 09:31:18.597 INFO  (main) [   ] o.e.j.s.Server Started @7571ms
2020-10-14 09:32:20.539 INFO  (qtp247162961-18) [   ] o.a.s.c.TransientSolrCoreCacheDefault Allocating transient cache for 2147483647 transient cores
2020-10-14 09:32:20.540 INFO  (qtp247162961-18) [   ] o.a.s.s.HttpSolrCall [admin] webapp=null path=/admin/cores params={indexInfo=false&wt=json&_=1602667940461} status=0 QTime=6
2020-10-14 09:32:20.552 WARN  (qtp247162961-17) [   ] o.a.s.h.a.ZookeeperStatusHandler Failed talking to zookeeper 0.0.0.0:2181 => org.apache.solr.common.SolrException: Failed talking to Zookeeper 0.0.0.0:2181
        at org.apache.solr.handler.admin.ZookeeperStatusHandler.getZkRawResponse(ZookeeperStatusHandler.java:294)
org.apache.solr.common.SolrException: Failed talking to Zookeeper 0.0.0.0:2181
        at org.apache.solr.handler.admin.ZookeeperStatusHandler.getZkRawResponse(ZookeeperStatusHandler.java:294) ~[?:?]
        at org.apache.solr.handler.admin.ZookeeperStatusHandler.monitorZookeeper(ZookeeperStatusHandler.java:238) ~[?:?]
        at org.apache.solr.handler.admin.ZookeeperStatusHandler.getZkStatus(ZookeeperStatusHandler.java:144) ~[?:?]
        at org.apache.solr.handler.admin.ZookeeperStatusHandler.handleRequestBody(ZookeeperStatusHandler.java:84) ~[?:?]
        at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:214) ~[?:?]
        at org.apache.solr.servlet.HttpSolrCall.handleAdmin(HttpSolrCall.java:857) ~[?:?]
        at org.apache.solr.servlet.HttpSolrCall.handleAdminRequest(HttpSolrCall.java:821) ~[?:?]
        at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:566) ~[?:?]
        at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:415) ~[?:?]
        at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:345) ~[?:?]
        at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1596) ~[jetty-servlet-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:545) ~[jetty-servlet-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:590) ~[jetty-security-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1610) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1300) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:485) ~[jetty-servlet-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1580) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1215) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:221) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.InetAccessHandler.handle(InetAccessHandler.java:177) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.rewrite.handler.RewriteHandler.handle(RewriteHandler.java:322) ~[jetty-rewrite-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.Server.handle(Server.java:500) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.HttpChannel.lambda$handle(HttpChannel.java:383) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:547) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:375) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:273) ~[jetty-server-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311) ~[jetty-io-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103) ~[jetty-io-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.io.ChannelEndPoint.run(ChannelEndPoint.java:117) ~[jetty-io-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336) ~[jetty-util-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313) ~[jetty-util-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171) ~[jetty-util-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129) ~[jetty-util-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:375) ~[jetty-util-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:806) ~[jetty-util-9.4.27.v20200227.jar:9.4.27.v20200227]
        at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:938) ~[jetty-util-9.4.27.v20200227.jar:9.4.27.v20200227]
        at java.lang.Thread.run(Unknown Source) [?:?]
Caused by: java.net.ConnectException: Connection refused (Connection refused)
        at java.net.PlainSocketImpl.socketConnect(Native Method) ~[?:?]
        at java.net.AbstractPlainSocketImpl.doConnect(Unknown Source) ~[?:?]
        at java.net.AbstractPlainSocketImpl.connectToAddress(Unknown Source) ~[?:?]
        at java.net.AbstractPlainSocketImpl.connect(Unknown Source) ~[?:?]
        at java.net.SocksSocketImpl.connect(Unknown Source) ~[?:?]
        at java.net.Socket.connect(Unknown Source) ~[?:?]
        at java.net.Socket.connect(Unknown Source) ~[?:?]
        at java.net.Socket.<init>(Unknown Source) ~[?:?]
        at java.net.Socket.<init>(Unknown Source) ~[?:?]
        at org.apache.solr.handler.admin.ZookeeperStatusHandler.getZkRawResponse(ZookeeperStatusHandler.java:285) ~[?:?]
        ... 46 more
2020-10-14 09:32:20.564 INFO  (qtp247162961-22) [   ] o.a.s.s.HttpSolrCall [admin] webapp=null path=/admin/info/system params={wt=json&_=1602667940462} status=0 QTime=29
2020-10-14 09:32:20.573 INFO  (qtp247162961-17) [   ] o.a.s.s.HttpSolrCall [admin] webapp=null path=/admin/zookeeper/status params={wt=json&_=1602667940521} status=0 QTime=39
2020-10-14 09:32:20.589 INFO  (qtp247162961-20) [   ] o.a.s.h.a.CollectionsHandler Invoked Collection Action :list with params action=LIST&wt=json&_=1602667940462 and sendToOCPQueue=true
2020-10-14 09:32:20.589 INFO  (qtp247162961-20) [   ] o.a.s.s.HttpSolrCall [admin] webapp=null path=/admin/collections params={action=LIST&wt=json&_=1602667940462} status=0 QTime=0
2020-10-14 09:32:20.612 INFO  (qtp247162961-18) [   ] o.a.s.h.a.CollectionsHandler Invoked Collection Action :listaliases with params action=LISTALIASES&wt=json&_=1602667940462 and sendToOCPQueue=true
2020-10-14 09:32:20.615 INFO  (qtp247162961-18) [   ] o.a.s.s.HttpSolrCall [admin] webapp=null path=/admin/collections params={action=LISTALIASES&wt=json&_=1602667940462} status=0 QTime=2

您不应使用 0.0.0.0,而应使用通过 dockerfile 定义的主机名。所以在 zoo1 配置 server1 应该是 zoo1,在 zoo2 server2 应该是 zoo2,在 zoo3 server 应该是 zoo3.