如何查询每个国家/地区最常用语言的 GHTorrent(SQL 类语言)
How to query GHTorrent's (SQL-like language) for most common languages per country
基于这个问题How to query GHTorrent's (SQL-like language) for country/city/users number/repositories number? and first query here https://ghtorrent.org/gcloud.html, I am trying to get an sql query to get the most common coding language per country and ideally per month/year from the GHtorrent bigquery database. I have tried to edit this answer code ,但未能获得正确的连接。我理想的结果应该是这样的
country
Year
Month
Language
Number of commits
total_bytes
US
2016
Jan
Python
10000
46789390
CH
2016
Jan
Java
20000
5679304
基本上,我不太擅长创建 SQL 查询。
我检查了您传递的查询的两个示例,然后我找到了 project_id
的共同值,我修改了第二个示例以带来 project_id
和 created_date
的提交。然后我决定像你提到的那样格式化 created_date
以带来年份和月份并将其添加为过滤器。
然后我将两个示例加入 CTE
并且我只 SELECT
需要的列的名称。
最后我用了一个ROW_NUMBER
只把每种语言处理过的字节数的最大值带country/year/month.
-- Most common language per country and calendar month in the GHTorrent
-- BigQuery dataset (BigQuery standard SQL).
--
-- Output: one row per (country_code, year, month) holding the language with
-- the largest summed byte count, plus the commit count and byte total that
-- were accumulated for that language.
--
-- Caveat: a project's commits are attributed to every language recorded for
-- that project, so NoOfCommits counts "commits to projects containing this
-- language", not commits written in that language.
WITH
-- Bytes per (project, language), taken from the most recent
-- project_languages row of each pair. Deleted projects and forks are skipped.
-- No ORDER BY here: the row order of a CTE is not preserved by its consumers,
-- so sorting would only add cost.
ltb AS (
    SELECT
        latest.project_id,
        latest.lang,
        SUM(snap.bytes) AS total_bytes
    FROM (
        SELECT
            pl.project_id AS project_id,
            pl.language AS lang,
            MAX(pl.created_at) AS latest_at
        FROM `ghtorrent-bq.ght.project_languages` AS pl
        INNER JOIN `ghtorrent-bq.ght.projects` AS p
            ON p.id = pl.project_id
        WHERE p.deleted IS FALSE
            AND p.forked_from IS NULL
        GROUP BY pl.project_id, pl.language
    ) AS latest
    INNER JOIN `ghtorrent-bq.ght.project_languages` AS snap
        ON snap.project_id = latest.project_id
        AND snap.language = latest.lang
        AND snap.created_at = latest.latest_at
    GROUP BY latest.project_id, latest.lang
),

-- Commits per (country, project, year, month), counted by committer.
-- formattedmonth ('01'..'12') is kept for sorting; formattedmonthname
-- ('Jan'..'Dec') is what the final result shows.
fprt AS (
    SELECT
        u.country_code,
        c.project_id,
        FORMAT_TIMESTAMP('%Y', c.created_at) AS formattedyear,
        FORMAT_TIMESTAMP('%m', c.created_at) AS formattedmonth,
        FORMAT_TIMESTAMP('%b', c.created_at) AS formattedmonthname,
        COUNT(*) AS NoOfCommits
    FROM `ghtorrent-bq.ght.commits` AS c
    INNER JOIN `ghtorrent-bq.ght.users` AS u
        ON c.committer_id = u.id
    WHERE NOT u.fake
        AND u.country_code IS NOT NULL
    GROUP BY
        u.country_code,
        c.project_id,
        formattedyear,
        formattedmonth,
        formattedmonthname
),

-- Roll the per-project rows up to one row per (country, year, month, language).
-- Ranking must happen on these totals; ranking the per-project rows would
-- pick the language of the single largest repository instead of the most
-- common language.
almst AS (
    SELECT
        f.country_code,
        f.formattedyear,
        f.formattedmonth,
        f.formattedmonthname,
        l.lang,
        SUM(f.NoOfCommits) AS NoOfCommits,
        SUM(l.total_bytes) AS total_bytes
    FROM fprt AS f
    INNER JOIN ltb AS l
        ON l.project_id = f.project_id
    GROUP BY
        f.country_code,
        f.formattedyear,
        f.formattedmonth,
        f.formattedmonthname,
        l.lang
)

SELECT
    country_code,
    formattedyear AS year,
    formattedmonthname AS month,
    lang,
    NoOfCommits,
    total_bytes
FROM (
    SELECT
        country_code,
        formattedyear,
        formattedmonth,
        formattedmonthname,
        lang,
        NoOfCommits,
        total_bytes,
        -- Tie-breakers (commits, then language name) make the winner
        -- deterministic when two languages have the same byte total.
        ROW_NUMBER() OVER (
            PARTITION BY country_code, formattedyear, formattedmonth
            ORDER BY total_bytes DESC, NoOfCommits DESC, lang ASC
        ) AS rn
    FROM almst
) AS ranked
WHERE rn = 1
ORDER BY formattedyear ASC, formattedmonth ASC, country_code ASC
输出:
基于这个问题How to query GHTorrent's (SQL-like language) for country/city/users number/repositories number? and first query here https://ghtorrent.org/gcloud.html, I am trying to get an sql query to get the most common coding language per country and ideally per month/year from the GHtorrent bigquery database. I have tried to edit this answer code ,但未能获得正确的连接。我理想的结果应该是这样的
country | Year | Month | Language | Number of commits | total_bytes |
---|---|---|---|---|---|
US | 2016 | Jan | Python | 10000 | 46789390 |
CH | 2016 | Jan | Java | 20000 | 5679304 |
基本上,我不太擅长创建 SQL 查询。
我检查了您传递的查询的两个示例,然后我找到了 project_id
的共同值,我修改了第二个示例以带来 project_id
和 created_date
的提交。然后我决定像你提到的那样格式化 created_date
以带来年份和月份并将其添加为过滤器。
然后我将两个示例加入 CTE
并且我只 SELECT
需要的列的名称。
最后我用了一个ROW_NUMBER
只把每种语言处理过的字节数的最大值带country/year/month.
-- Most common language per country and calendar month in the GHTorrent
-- BigQuery dataset (BigQuery standard SQL).
--
-- Output: one row per (country_code, year, month) holding the language with
-- the largest summed byte count, plus the commit count and byte total that
-- were accumulated for that language.
--
-- Caveat: a project's commits are attributed to every language recorded for
-- that project, so NoOfCommits counts "commits to projects containing this
-- language", not commits written in that language.
WITH
-- Bytes per (project, language), taken from the most recent
-- project_languages row of each pair. Deleted projects and forks are skipped.
-- No ORDER BY here: the row order of a CTE is not preserved by its consumers,
-- so sorting would only add cost.
ltb AS (
    SELECT
        latest.project_id,
        latest.lang,
        SUM(snap.bytes) AS total_bytes
    FROM (
        SELECT
            pl.project_id AS project_id,
            pl.language AS lang,
            MAX(pl.created_at) AS latest_at
        FROM `ghtorrent-bq.ght.project_languages` AS pl
        INNER JOIN `ghtorrent-bq.ght.projects` AS p
            ON p.id = pl.project_id
        WHERE p.deleted IS FALSE
            AND p.forked_from IS NULL
        GROUP BY pl.project_id, pl.language
    ) AS latest
    INNER JOIN `ghtorrent-bq.ght.project_languages` AS snap
        ON snap.project_id = latest.project_id
        AND snap.language = latest.lang
        AND snap.created_at = latest.latest_at
    GROUP BY latest.project_id, latest.lang
),

-- Commits per (country, project, year, month), counted by committer.
-- formattedmonth ('01'..'12') is kept for sorting; formattedmonthname
-- ('Jan'..'Dec') is what the final result shows.
fprt AS (
    SELECT
        u.country_code,
        c.project_id,
        FORMAT_TIMESTAMP('%Y', c.created_at) AS formattedyear,
        FORMAT_TIMESTAMP('%m', c.created_at) AS formattedmonth,
        FORMAT_TIMESTAMP('%b', c.created_at) AS formattedmonthname,
        COUNT(*) AS NoOfCommits
    FROM `ghtorrent-bq.ght.commits` AS c
    INNER JOIN `ghtorrent-bq.ght.users` AS u
        ON c.committer_id = u.id
    WHERE NOT u.fake
        AND u.country_code IS NOT NULL
    GROUP BY
        u.country_code,
        c.project_id,
        formattedyear,
        formattedmonth,
        formattedmonthname
),

-- Roll the per-project rows up to one row per (country, year, month, language).
-- Ranking must happen on these totals; ranking the per-project rows would
-- pick the language of the single largest repository instead of the most
-- common language.
almst AS (
    SELECT
        f.country_code,
        f.formattedyear,
        f.formattedmonth,
        f.formattedmonthname,
        l.lang,
        SUM(f.NoOfCommits) AS NoOfCommits,
        SUM(l.total_bytes) AS total_bytes
    FROM fprt AS f
    INNER JOIN ltb AS l
        ON l.project_id = f.project_id
    GROUP BY
        f.country_code,
        f.formattedyear,
        f.formattedmonth,
        f.formattedmonthname,
        l.lang
)

SELECT
    country_code,
    formattedyear AS year,
    formattedmonthname AS month,
    lang,
    NoOfCommits,
    total_bytes
FROM (
    SELECT
        country_code,
        formattedyear,
        formattedmonth,
        formattedmonthname,
        lang,
        NoOfCommits,
        total_bytes,
        -- Tie-breakers (commits, then language name) make the winner
        -- deterministic when two languages have the same byte total.
        ROW_NUMBER() OVER (
            PARTITION BY country_code, formattedyear, formattedmonth
            ORDER BY total_bytes DESC, NoOfCommits DESC, lang ASC
        ) AS rn
    FROM almst
) AS ranked
WHERE rn = 1
ORDER BY formattedyear ASC, formattedmonth ASC, country_code ASC
输出: