查询 Druid SQL 内部连接，数据源名称带有破折号

Question

如何在两个数据源之间编写 INNER JOIN 查询，其中一个数据源的模式名称带有破折号

在 Druid SQL 二进制上执行以下查询导致查询错误

SELECT * 
FROM first 
INNER JOIN "second-schema" on first.device_id = "second-schema".device_id;

org.apache.druid.java.util.common.ISE: Cannot build plan for query

当尝试引用名称中有破折号的数据源时，这是正确的语法吗？

架构

[
  {
    "dataSchema": {
      "dataSource": "second-schema",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "timestampSpec": {
            "column": "ts_start"
          },
          "dimensionsSpec": {
            "dimensions": [
              "etid",
              "device_id",
              "device_name",
              "x_1",
              "x_2",
              "x_3",
              "vlan",
              "s_x",
              "d_x",
              "d_p",
              "msg_type"
            ],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        }
      },
      "metricsSpec": [
        { "type": "hyperUnique", "name": "conn_id_hll", "fieldName": "conn_id"},
        {
          "type": "count",
          "name": "event_count"
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "HOUR",
        "queryGranularity": "minute"
      }
    },
    "ioConfig": {
      "type": "realtime",
      "firehose": {
        "type": "kafka-0.8",
        "consumerProps": {
          "zookeeper.connect": "localhost:2181",
          "zookeeper.connectiontimeout.ms": "15000",
          "zookeeper.sessiontimeout.ms": "15000",
          "zookeeper.synctime.ms": "5000",
          "group.id": "flow-info",
          "fetch.size": "1048586",
          "autooffset.reset": "largest",
          "autocommit.enable": "false"
        },
        "feed": "flow-info"
      },
      "plumber": {
        "type": "realtime"
      }
    },
    "tuningConfig": {
      "type": "realtime",
      "maxRowsInMemory": 50000,
      "basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
      "intermediatePersistPeriod": "PT10m",
      "windowPeriod": "PT15m",
      "rejectionPolicy": {
        "type": "serverTime"
      }
    }
  },
  {
    "dataSchema": {
      "dataSource": "first",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "timestampSpec": {
            "column": "ts_start"
          },
          "dimensionsSpec": {
            "dimensions": [
              "etid",
              "category",
              "device_id",
              "device_name",
              "severity",
              "x_2",
              "x_3",
              "x_4",
              "x_5",
              "vlan",
              "s_x",
              "d_x",
              "s_i",
              "d_i",
              "d_p",
              "id"
            ],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        }
      },
      "metricsSpec": [
        { "type": "doubleSum",  "name": "val_num",      "fieldName": "val_num" },
        { "type": "doubleMin",  "name": "val_num_min",  "fieldName": "val_num" },
        { "type": "doubleMax",  "name": "val_num_max",  "fieldName": "val_num" },
        { "type": "doubleSum",  "name": "size",         "fieldName": "size" },
        { "type": "doubleMin",  "name": "size_min",     "fieldName": "size" },
        { "type": "doubleMax",  "name": "size_max",     "fieldName": "size" },
        { "type": "count", "name": "first_count" }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "HOUR",
        "queryGranularity": "minute"
      }
    },
    "ioConfig": {
      "type": "realtime",
      "firehose": {
        "type": "kafka-0.8",
        "consumerProps": {
          "zookeeper.connect": "localhost:2181",
          "zookeeper.connectiontimeout.ms": "15000",
          "zookeeper.sessiontimeout.ms": "15000",
          "zookeeper.synctime.ms": "5000",
          "group.id": "first",
          "fetch.size": "1048586",
          "autooffset.reset": "largest",
          "autocommit.enable": "false"
        },
        "feed": "first"
      },
      "plumber": {
        "type": "realtime"
      }
    },
    "tuningConfig": {
      "type": "realtime",
      "maxRowsInMemory": 50000,
      "basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
      "intermediatePersistPeriod": "PT10m",
      "windowPeriod": "PT15m",
      "rejectionPolicy": {
        "type": "serverTime"
      }
    }
  }
]

Answer 1

根据您的架构定义，我将进行一些观察。

进行联接时，您通常必须明确列出列（不使用 *），否则您会因重复列而发生冲突。例如，在您的联接中，您在“first”和“second-schema”中都有一个 device_id，更不用说所有其他列都相同了。
当使用文字定界符时，我不会混淆它们。我要么用要么不用。

所以我认为您的查询以类似这样的形式会更好

 SELECT
    "first"."etid",
    "first"."category",
    "first"."device_id",
    "first"."device_name",
    "first"."severity",
    "first"."x_2",
    "first"."x_3",
    "first"."x_4",
    "first"."x_5",
    "first"."vlan",
    "first"."s_x",
    "first"."d_x",
    "first"."s_i",
    "first"."d_i",
    "first"."d_p",
    "first"."id",
    "second-schema"."etid" as "ss_etid",
    "second-schema"."device_id" as "ss_device_id",
    "second-schema"."device_name" as "ss_device_name",
    "second-schema"."x_1" as "ss_x_1",
    "second-schema"."x_2" as "ss_x_2",
    "second-schema"."x_3" as "ss_x_3",
    "second-schema"."vlan" as "ss_vlan",
    "second-schema"."s_x" as "ss_s_x",
    "second-schema"."d_x" as "ss_d_x",
    "second-schema"."d_p" as "ss_d_p",
    "second-schema"."msg_type"
 FROM "first"
 INNER JOIN "second-schema" ON "first"."device_id" = "second-schema"."device_id";

显然，您可以随意命名您认为合适的列，或根据需要包括排除列。 Select * 仅当两个表中的所有列都唯一时才有效。

查询 Druid SQL 内部连接，数据源名称带有破折号

Query Druid SQL inner join with a dataSource name that has a dash

sql

inner-join

druid