Hue 访问 HDFS:绕过默认 hue.ini?

Hue access to HDFS: bypass default hue.ini?

设置

我正在尝试用 bde2020 提供的镜像搭建一个轻量级的最小 hadoop 堆栈(学习目的)。目前,堆栈包括(除其他外)

基本上,我是从 Big Data Europe 官方的 docker-compose 配置开始的,并按照他们的文档添加了一个 hue 镜像

问题

Hue 的文件浏览器无法访问 HDFS:

Cannot access: /user/dav. The HDFS REST service is not available. Note: you are a Hue admin but not a HDFS superuser, "hdfs" or part of HDFS supergroup, "supergroup".

HTTPConnectionPool(host='namenode', port=50070): Max retries exceeded with url: /webhdfs/v1/user/dav?op=GETFILESTATUS&user.name=hue&doas=dav (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7f8119a3cf10>: Failed to establish a new connection: [Errno 111] Connection refused',))

到目前为止我为界定问题所做的努力

当我登录 hue 的容器时,我可以看到 namenode 的端口 9870 是打开的 (nmap -p 9870 namenode)。 50070 不是。我认为我的问题与网络无关。尽管编辑 hue.ini,Hue 仍然使用端口 50070。那么,如何在我当前的设置中强制 hue 使用端口 9870? (如果是这个原因)

docker-compose 配置

# Question (not yet working) setup: bde2020 Hadoop 3.1.1 stack plus the
# bde2020 Hue filebrowser image.
version: '3.7'

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.1.1-java8
    container_name: namenode
    hostname: namenode
    domainname: hadoop
    ports:
      - 9870:9870
    volumes:
      - hadoop_namenode:/hadoop/dfs/name
      - ./entrypoints/namenode/entrypoint.sh:/entrypoint.sh
    env_file:
      - ./hadoop.env
      - .env
    networks:
      - hadoop_net
    # TODO adduser --ingroup hadoop dav

  datanode1:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.1-java8
    container_name: datanode
    hostname: datanode1
    domainname: hadoop
    volumes:
      - hadoop_datanode:/hadoop/dfs/data
    environment:
      SERVICE_PRECONDITION: "namenode:9870"
    env_file:
      - ./hadoop.env
    networks:
      - hadoop_net

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.1.1-java8
    container_name: resourcemanager
    environment:
      SERVICE_PRECONDITION: "namenode:9870 datanode:9864"
    env_file:
      - ./hadoop.env
    networks:
      - hadoop_net

  nodemanager1:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.1.1-java8
    container_name: nodemanager
    environment:
      SERVICE_PRECONDITION: "namenode:9870 datanode:9864 resourcemanager:8088"
    env_file:
      - ./hadoop.env
    networks:
      - hadoop_net

  historyserver:
    image: bde2020/hadoop-historyserver:2.0.0-hadoop3.1.1-java8
    container_name: historyserver
    environment:
      SERVICE_PRECONDITION: "namenode:9870 datanode:9864 resourcemanager:8088"
    volumes:
      - hadoop_historyserver:/hadoop/yarn/timeline
    env_file:
      - ./hadoop.env
    networks:
      - hadoop_net

  filebrowser:
    container_name: hue
    # NOTE(review): the error message above shows Hue calling namenode:50070,
    # but this Hadoop 3.x namenode serves WebHDFS on 9870 — presumably this
    # image's baked-in hue.ini still points at the Hadoop 2.x port.
    image: bde2020/hdfs-filebrowser:3.11
    ports:
      - "8088:8088"
    env_file:
      - ./hadoop.env
    volumes: # BYPASS DEFAULT webhdfs url
      - ./overrides/hue/hue.ini:/opt/hue/desktop/conf.dist/hue.ini
    environment:
      - NAMENODE_HOST=namenode
    networks:
      - hadoop_net

networks:
  hadoop_net:

volumes:
  hadoop_namenode:
  hadoop_datanode:
  hadoop_historyserver:

我之前曾用下面这个 INI 让 Filebrowser 正常工作

# hue.ini that worked — but against a Hadoop 2.7.1 cluster, whose
# WebHDFS listens on 50070.
[desktop]
  http_host=0.0.0.0
  http_port=8888
  time_zone=America/Chicago
  dev=true
  app_blacklist=impala,zookeeper,oozie,hbase,security,search
[hadoop]
  [[hdfs_clusters]]
    [[[default]]]
      fs_defaultfs=hdfs://namenode:8020
      # 50070 is the Hadoop 2.x WebHDFS port (Hadoop 3.x moved it to 9870).
      webhdfs_url=http://namenode:50070/webhdfs/v1
      security_enabled=false

以及这个 docker-compose 配置

# Hadoop 2.7.1 stack that worked with the hue.ini above.
version: "2"

services:
  namenode:
    image: bde2020/hadoop-namenode:1.1.0-hadoop2.7.1-java8
    container_name: namenode
    ports:
      - 8020:8020
      - 50070:50070
      # - 59050:59050
    volumes:
      - hadoop_namenode:/hadoop/dfs/name
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop.env
    networks:
      - hadoop

  datanode1:
    image: bde2020/hadoop-datanode:1.1.0-hadoop2.7.1-java8
    container_name: datanode1
    ports:
      - 50075:50075
      # - 50010:50010
      # - 50020:50020
    depends_on:
      - namenode
    volumes:
      - hadoop_datanode1:/hadoop/dfs/data
    env_file:
      - ./hadoop.env
    networks:
      - hadoop

  hue:
    image: gethue/hue
    container_name: hue
    ports:
      - 8000:8888
    depends_on:
      - namenode
    volumes:
      - ./conf/hue.ini:/hue/desktop/conf/pseudo-distributed.ini
    networks:
      - hadoop
      - frontend

# FIX(review): these two sections were indented at service-key level inside
# the `hue` service, where they duplicated hue's own `volumes`/`networks`
# keys (invalid YAML per spec; silently last-wins in most parsers) and left
# the named volumes/networks undefined. They belong at the top level.
volumes:
  hadoop_namenode:
  hadoop_datanode1:

networks:
  hadoop:
  frontend:

hadoop.env 也必须添加 hue 作为代理用户

# hadoop.env — "hue" must also be registered as a Hadoop proxy user so it
# can impersonate the logged-in user (doas=...) over WebHDFS.
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root

# Allow the hue service user to proxy for any user from any host/group.
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*

HDFS_CONF_dfs_replication=1
# The WebHDFS REST API must be enabled for Hue's file browser.
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false

是的,找到了。几个关键要素:

  • 在 hadoop 3.* 中,webhdfs 不再监听 50070 但 9870 是标准端口
  • 覆盖 hue.ini 需要挂载一个名为 hue-overrides.ini 的文件
  • 来自 gethue 的 hue 镜像比来自 bde2020 的镜像更新(不过他们的 hadoop 堆栈非常棒)

Docker-compose

# Working compose: bde2020 Hadoop 3.1.1 stack + the newer gethue/hue image.
version: '3.7'

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.1.1-java8
    container_name: namenode
    ports:
      # Hadoop 3.x: 9870 = HTTP/WebHDFS, 8020 = RPC (fs.defaultFS).
      # Port mappings quoted per Compose best practice (avoids YAML 1.1
      # sexagesimal parsing of colon-separated numbers).
      - "9870:9870"
      - "8020:8020"
    volumes:
      - hadoop_namenode:/hadoop/dfs/name
      - ./overrides/namenode/entrypoint.sh:/entrypoint.sh
    env_file:
      - ./hadoop.env
      - .env
    networks:
      - hadoop

  filebrowser:
    container_name: hue
    image: gethue/hue:4.4.0
    ports:
      # Hue listens on 8888 inside the container; published as 8000.
      - "8000:8888"
    env_file:
      - ./hadoop.env
    volumes: # HERE — mount the override file, not the main hue.ini
      - ./overrides/hue/hue-overrides.ini:/usr/share/hue/desktop/conf/hue-overrides.ini
    depends_on:
      - namenode
    networks:
      - hadoop
      - frontend

  datanode1:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.1-java8
    container_name: datanode1
    volumes:
      - hadoop_datanode:/hadoop/dfs/data
    environment:
      SERVICE_PRECONDITION: "namenode:9870"
    env_file:
      - ./hadoop.env
    networks:
      - hadoop

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.1.1-java8
    container_name: resourcemanager
    environment:
      SERVICE_PRECONDITION: "namenode:9870 datanode1:9864"
    env_file:
      - ./hadoop.env
    networks:
      - hadoop

  nodemanager1:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.1.1-java8
    container_name: nodemanager
    environment:
      SERVICE_PRECONDITION: "namenode:9870 datanode1:9864 resourcemanager:8088"
    env_file:
      - ./hadoop.env
    networks:
      - hadoop

  historyserver:
    image: bde2020/hadoop-historyserver:2.0.0-hadoop3.1.1-java8
    container_name: historyserver
    environment:
      SERVICE_PRECONDITION: "namenode:9870 datanode1:9864 resourcemanager:8088"
    volumes:
      - hadoop_historyserver:/hadoop/yarn/timeline
    env_file:
      - ./hadoop.env
    networks:
      - hadoop

networks:
  hadoop:
  frontend:

volumes:
  hadoop_namenode:
  hadoop_datanode:
  hadoop_historyserver:

hadoop.env

# hadoop.env for the Hadoop 3 stack.
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
# Allow the "hue" user to impersonate (proxy) other users over WebHDFS.
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec

HDFS_CONF_dfs_replication=1
# WebHDFS must be enabled for Hue's file browser REST calls.
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false

hue-overrides.ini

# hue-overrides.ini — mounted into /usr/share/hue/desktop/conf/ so Hue
# merges it over its default configuration.
[desktop]
  http_host=0.0.0.0
  http_port=8888
  # FIX(review): "France" is not a valid IANA tz-database identifier; Hue's
  # time_zone setting expects one (e.g. Europe/Paris).
  time_zone=Europe/Paris
  dev=true
  app_blacklist=impala,zookeeper,oozie,hbase,security,search
[hadoop]
  [[hdfs_clusters]]
    [[[default]]]
      fs_defaultfs=hdfs://namenode:8020
      # Hadoop 3.x WebHDFS port (was 50070 in Hadoop 2.x) — the key fix.
      webhdfs_url=http://namenode:9870/webhdfs/v1
      security_enabled=false

谢谢@cricket_007