使用 BigQuery 进行单热编码(虚拟变量)
one-hot-encoding (dummy variables) with BigQuery
我想使用 BigQuery 而不是 Pandas 为我的类别创建虚拟变量(单热编码)。
我最终会得到大约 200 列,因此我无法手动完成并对其进行硬编码
测试数据集(实际比这个多很多变量)
WITH table AS (
SELECT 1001 as ID, 'blue' As Color, 'big' AS size UNION ALL
SELECT 1002 as ID, 'yellow' As Color, 'medium' AS size UNION ALL
SELECT 1003 as ID, 'red' As Color, 'small' AS size UNION ALL
SELECT 1004 as ID, 'blue' As Color, 'small' AS size)
SELECT *
FROM table
预期结果:
以下适用于 BigQuery 标准 SQL
DECLARE Colors, Sizes ARRAY<STRING>;
SET (Colors, Sizes) = (SELECT AS STRUCT ARRAY_AGG(DISTINCT Color), ARRAY_AGG(DISTINCT Size) FROM `project.dataset.table`);
EXECUTE IMMEDIATE '''
CREATE TEMP TABLE result AS -- added line
SELECT ID, ''' || (
SELECT STRING_AGG("COUNTIF(Color = '" || Color || "') AS Color_" || Color ORDER BY Color)
FROM UNNEST(Colors) AS Color
) || (
SELECT ', ' || STRING_AGG("COUNTIF(Size = '" || Size || "') AS Size_" || Size ORDER BY Size)
FROM UNNEST(Sizes) AS Size
) || '''
FROM `project.dataset.table`
GROUP BY ID
ORDER BY ID
'''; -- added `;`
SELECT * FROM result; -- added line
如果应用于您问题中的示例数据 - 输出如下
我想使用 BigQuery 而不是 Pandas 为我的类别创建虚拟变量(单热编码)。 我最终会得到大约 200 列,因此我无法手动完成并对其进行硬编码
测试数据集(实际比这个多很多变量)
WITH table AS (
SELECT 1001 as ID, 'blue' As Color, 'big' AS size UNION ALL
SELECT 1002 as ID, 'yellow' As Color, 'medium' AS size UNION ALL
SELECT 1003 as ID, 'red' As Color, 'small' AS size UNION ALL
SELECT 1004 as ID, 'blue' As Color, 'small' AS size)
SELECT *
FROM table
预期结果:
以下适用于 BigQuery 标准 SQL
DECLARE Colors, Sizes ARRAY<STRING>;
SET (Colors, Sizes) = (SELECT AS STRUCT ARRAY_AGG(DISTINCT Color), ARRAY_AGG(DISTINCT Size) FROM `project.dataset.table`);
EXECUTE IMMEDIATE '''
CREATE TEMP TABLE result AS -- added line
SELECT ID, ''' || (
SELECT STRING_AGG("COUNTIF(Color = '" || Color || "') AS Color_" || Color ORDER BY Color)
FROM UNNEST(Colors) AS Color
) || (
SELECT ', ' || STRING_AGG("COUNTIF(Size = '" || Size || "') AS Size_" || Size ORDER BY Size)
FROM UNNEST(Sizes) AS Size
) || '''
FROM `project.dataset.table`
GROUP BY ID
ORDER BY ID
'''; -- added `;`
SELECT * FROM result; -- added line
如果应用于您问题中的示例数据 - 输出如下