在 R 中从 REST API 检索数据时如何解决分页问题
How to resolve paging when retrieving data from REST API in R
我正在使用 REST API 使用以下代码从 Azure Table 检索数据:
library(httr)
library(RCurl)
library(bitops)
library(xml2)
library(jsonlite) # provides fromJSON(), which the parsing step below relies on

# Stores credentials in variables
Account <- "storageaccount"
Container <- "Usage"
Key <- "key"

# Composes the URL of the table resource
URL <- paste0("https://", Account, ".table.core.windows.net", "/", Container)

# Request time stamp, formatted in GMT; the same value is signed and sent
requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")

# Composes the string to sign for Shared Key authorization
signature_string <- paste0(
  "GET", "\n",                  # HTTP verb
  "\n",                         # Content-MD5 (left empty)
  "text/xml", "\n",             # Content-Type
  requestdate, "\n",            # Date
  "/", Account, "/", Container  # Canonicalized resource
)

# Signs the string with HMAC-SHA256, keyed with the base64-decoded account key
signature <- RCurl::base64(
  digest::hmac(
    key = RCurl::base64Decode(Key, mode = "raw"),
    object = enc2utf8(signature_string),
    algo = "sha256",
    raw = TRUE
  )
)

# Composes the request headers
header_string <- add_headers(
  Authorization = paste0("SharedKey ", Account, ":", signature),
  "x-ms-date" = requestdate,
  "x-ms-version" = "2020-12-06",
  "Content-type" = "text/xml"
)

# Sends the request and reads the response body as text
xml_body <- content(
  GET(URL, config = header_string, verbose()),
  "text"
)

Get_data <- xml_body                            # Response body as text
From_JSON <- fromJSON(Get_data, flatten = TRUE) # Parses the text as JSON
Table_name <- as.data.frame(From_JSON)          # Saves the data to a table
我现在可以查看 table,但我注意到我只能看到前 1000 行。实现检索所有剩余行并更新 table 的 loop/cycle 的最有效方法是什么?
我需要能够处理整个数据集。
还要考虑到此 table 每天将更新约 40,000 行,因此保持视觉效果与数据同步是一个问题。
提前感谢您的建议!
~外星人
不确定如何在 R 中具体实现,但一般方法如下:
当您列出 table 中的实体时,单个请求最多返回 1000 个实体。如果 table 包含超过 1000 个实体,Table 服务将返回两个额外的 headers:x-ms-continuation-NextPartitionKey 和 x-ms-continuation-NextRowKey。这两个 headers 的存在表明还有更多数据可供您获取。
您需要做的是读取这两个 headers 的值,并在下一个请求的 URL 中指定两个查询参数:NextPartitionKey 和 NextRowKey。所以你的请求应该是这样的:
https://account.table.core.windows.net/Table?NextPartitionKey=<x-ms-continuation-NextPartitionKey header value>&NextRowKey=<x-ms-continuation-NextRowKey header value>
您需要重复该过程,直到您在响应中没有得到这些 headers。
您可以在此处了解更多信息:https://docs.microsoft.com/en-us/rest/api/storageservices/query-timeout-and-pagination。
感谢指点!我已经整理了一些代码...不幸的是,我没有通过第一个循环(检索第 3 页)而且我不太明白为什么。
我通过做出一些假设来编写它,例如返回的元数据始终具有相同的结构。
这是代码:
# First paging attempt. It reuses URL, header_string and xml_body from the
# single-request script, reads the response headers by scraping the text that
# capture.output() collects on the message stream, and tries to loop while
# continuation headers are present.
# NOTE(review): this block is the attempt reported as failing; the notes below
# mark the problems visible in the code itself.
library(httr)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)
# Retrieves metadata
# GET() is evaluated inside capture.output(type = "message"), so whatever the
# request writes to the message stream (presumably the verbose() log) ends up
# in Get_headers, one element per line.
Get_headers <- capture.output(
content(
GET(
URL,
config = header_string,
verbose()
)
),
type = "message")
# NOTE(review): the status line is assumed to be element 11 of the captured
# log. With %>% the captured line becomes grepl()'s first argument (pattern)
# and "HTTP/1.1 200 OK" becomes the text searched - confirm this is intended.
Server_response <- Get_headers[11] %>%
trimws( whitespace = "\r") %>%
trimws( whitespace = "<- ") %>%
grepl("HTTP/1.1 200 OK")
# Initializes variables
Pages <- 0
Next_headers_count <- 0
# Fetches data only if authentication was successful
# NOTE(review): `=` is assignment, not comparison, and R does not parse it
# inside if(); isTRUE(Server_response) was presumably intended.
if (Server_response = TRUE) {
Get_data <- xml_body # Gets data as text from API
From_JSON <-fromJSON(Get_data, flatten = TRUE) # Parses text from JSON
Table_name <- as.data.frame(From_JSON) # Saves data to a table
Pages <- Pages + 1 # One page of data has been retrieved
# Checks if there are more than 1000 rows to be fetched
# NOTE(review): the continuation headers are assumed to be elements 19 and 20
# of the captured log. "\.*" is not a valid escape in an R string literal
# ("\\.*" would be), and with %>% the captured line is passed as gsub()'s
# pattern while the header name is passed as x - confirm the intended
# argument order.
x_ms_continuation_NextPartitionKey <- Get_headers[19] %>%
trimws( whitespace = "<- ") %>%
gsub("\.*", "x-ms-continuation-NextPartitionKey") %>%
grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
x_ms_continuation_NextRowKey <- Get_headers[20] %>%
trimws( whitespace = "<- ") %>%
gsub("\.*", "x-ms-continuation-NextRowKey") %>%
grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
# Starts loop to retrieve additional data
# NOTE(review): same `= TRUE` problem as the if() above; for a scalar loop
# condition `&&` is the usual operator rather than `&`.
while (x_ms_continuation_NextPartitionKey = TRUE &
x_ms_continuation_NextRowKey = TRUE) {
Pages <- Pages + 1 # Counts the number of pages retrieved, including the initial page
Next_headers_count <- Next_headers_count +1 # Counts the number of Next headers passed by the metadata
# NOTE(review): both keys are read from Get_headers, which is never reassigned
# inside the loop (the refreshed log is stored in Get_new_headers), so every
# iteration rebuilds the same Next_URL.
Next_Partition_Key <- Get_headers[19] %>% # Extracts the value of the Next Partition Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
Next_Row_key <- Get_headers[20] %>% # Extracts the value of the Next Row Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
# The key values are pasted into the query string as-is (no URL encoding).
Next_URL <- paste0( # Creates the URL for the Next Authentication token
"https://",
Account,
".table.core.windows.net",
"/",
Container,
"?",
"NextPartitionKey=",
Next_Partition_Key,
"&NextRowKey=",
Next_Row_key
)
next_xml_body = content( # Retrieves next 1000 rows of content from table
GET(
Next_URL,
config = header_string,
verbose()
),
"text"
)
Get_new_data <- next_xml_body # Gets data as text from API
From_JSON <-fromJSON(Get_new_data, flatten = TRUE) # Parses text from JSON
Temp_table_name <- as.data.frame(From_JSON) # Saves data to a table
# The new page is placed before the rows collected so far.
Table_name <- bind_rows(Temp_table_name, Table_name) # Appends new data to the initial data
# Second request for the same Next_URL, issued only to capture its log.
Get_new_headers <- capture.output( # Retrieves new next headers
content(
GET(
Next_URL,
config = header_string,
verbose()
)
),
type = "message")
# NOTE(review): New_server_response is computed but not used afterwards.
New_server_response <- Get_new_headers[11] %>%
trimws( whitespace = "\r") %>%
trimws( whitespace = "<- ") %>%
grepl("HTTP/1.1 200 OK")
# Checks if there are more than 1000 rows to be fetched
New_x_ms_continuation_NextPartitionKey <- Get_new_headers[19] %>%
trimws( whitespace = "<- ") %>%
gsub("\.*", "x-ms-continuation-NextPartitionKey") %>%
grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
New_x_ms_continuation_NextRowKey <- Get_new_headers[20] %>%
trimws( whitespace = "<- ") %>%
gsub("\.*", "x-ms-continuation-NextRowKey") %>%
grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
# Carries the refreshed flags into the loop condition.
x_ms_continuation_NextPartitionKey <- New_x_ms_continuation_NextPartitionKey
x_ms_continuation_NextRowKey <- New_x_ms_continuation_NextRowKey
# NOTE(review): these two values are overwritten from Get_headers at the top
# of the next iteration, so the refreshed keys are never used.
Next_Partition_Key <- Get_new_headers[19] %>% # Extracts the value of the Next Partition Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
Next_Row_key <- Get_new_headers[20] %>% # Extracts the value of the Next Row Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
}
} else {print("authentication failed")}
# Previews table
Pages
Next_headers_count
View(Table_name)
有了这个,我只能检索 2000 个条目。当下一个循环开始时,它失败了。好像这里失败了:
# Excerpt of the statement reported as failing: it requests Next_URL again and
# stores in Get_new_headers whatever capture.output() collects on the message
# stream while GET() runs (presumably the verbose() log with the headers).
Get_new_headers <- capture.output( # Retrieves new next headers
content(
GET(
Next_URL,
config = header_string,
verbose()
)
),
type = "message")
这是错误:
如有任何帮助,我们将不胜感激!
我明白了...我改进了脚本,现在它可以运行了。 :-)
代码可以进一步完善,但这是一个工作脚本,可以在迭代中从 table 检索所有数据。
# Connects to an Azure Table based on the specifications for Shared Key: https://docs.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key
library(httr)
library(RCurl)
library(bitops)
library(xml2)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)

# Stores credentials in variables
Account <- "storage"
Container <- "Usage"
Key <- "key"

# Composes the base URL of the table resource
URL <- paste0("https://", Account, ".table.core.windows.net", "/", Container)

# Builds freshly dated Shared Key headers for a GET on the table.
# A new time stamp is signed on every call because Azure Storage rejects
# requests whose date is too old (15 minutes per the Shared Key documentation),
# and a long paging run could outlive a single signature.
build_headers <- function() {
  requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")
  signature_string <- paste0(
    "GET", "\n",                  # HTTP verb
    "\n",                         # Content-MD5 (left empty)
    "text/xml", "\n",             # Content-Type
    requestdate, "\n",            # Date
    "/", Account, "/", Container  # Canonicalized resource
  )
  signature <- RCurl::base64(
    digest::hmac(
      key = RCurl::base64Decode(Key, mode = "raw"),
      object = enc2utf8(signature_string),
      algo = "sha256",
      raw = TRUE
    )
  )
  add_headers(
    Authorization = paste0("SharedKey ", Account, ":", signature),
    "x-ms-date" = requestdate,
    "x-ms-version" = "2020-12-06",
    "Content-type" = "text/xml"
  )
}

# Initializes counters and the page accumulator
Pages <- 0
Next_headers_count <- 0
page_tables <- list()
continuation <- NULL # Query parameters for the next page; NULL for the first

repeat {
  # One request per page; httr URL-encodes the continuation tokens in `query`
  response <- GET(URL, config = build_headers(), query = continuation)

  Server_response <- status_code(response) == 200L
  if (!Server_response) {
    warning(
      "Request failed with HTTP status ", status_code(response),
      call. = FALSE
    )
    break
  }

  # Parses up to 1000 rows returned for this page
  Pages <- Pages + 1
  From_JSON <- fromJSON(
    content(response, "text", encoding = "UTF-8"),
    flatten = TRUE
  )
  page_tables[[Pages]] <- as.data.frame(From_JSON)

  # Reads the continuation tokens from the response headers (name lookup on
  # httr's header list is case-insensitive)
  response_headers <- headers(response)
  Next_Partition_Key <- response_headers[["x-ms-continuation-nextpartitionkey"]]
  Next_Row_key <- response_headers[["x-ms-continuation-nextrowkey"]]

  # No partition-key token means the last page has been retrieved
  if (is.null(Next_Partition_Key)) {
    break
  }

  Next_headers_count <- Next_headers_count + 1
  continuation <- list(NextPartitionKey = Next_Partition_Key)
  if (!is.null(Next_Row_key)) {
    continuation$NextRowKey <- Next_Row_key
  }
}

# Binds all pages once, in the order they were retrieved
Table_name <- bind_rows(page_tables)

Pages
Next_headers_count
View(Table_name)
我正在使用 REST API 使用以下代码从 Azure Table 检索数据:
library(httr)
library(RCurl)
library(bitops)
library(xml2)
library(jsonlite) # provides fromJSON(), which the parsing step below relies on

# Stores credentials in variables
Account <- "storageaccount"
Container <- "Usage"
Key <- "key"

# Composes the URL of the table resource
URL <- paste0("https://", Account, ".table.core.windows.net", "/", Container)

# Request time stamp, formatted in GMT; the same value is signed and sent
requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")

# Composes the string to sign for Shared Key authorization
signature_string <- paste0(
  "GET", "\n",                  # HTTP verb
  "\n",                         # Content-MD5 (left empty)
  "text/xml", "\n",             # Content-Type
  requestdate, "\n",            # Date
  "/", Account, "/", Container  # Canonicalized resource
)

# Signs the string with HMAC-SHA256, keyed with the base64-decoded account key
signature <- RCurl::base64(
  digest::hmac(
    key = RCurl::base64Decode(Key, mode = "raw"),
    object = enc2utf8(signature_string),
    algo = "sha256",
    raw = TRUE
  )
)

# Composes the request headers
header_string <- add_headers(
  Authorization = paste0("SharedKey ", Account, ":", signature),
  "x-ms-date" = requestdate,
  "x-ms-version" = "2020-12-06",
  "Content-type" = "text/xml"
)

# Sends the request and reads the response body as text
xml_body <- content(
  GET(URL, config = header_string, verbose()),
  "text"
)

Get_data <- xml_body                            # Response body as text
From_JSON <- fromJSON(Get_data, flatten = TRUE) # Parses the text as JSON
Table_name <- as.data.frame(From_JSON)          # Saves the data to a table
我现在可以查看 table,但我注意到我只能看到前 1000 行。实现检索所有剩余行并更新 table 的 loop/cycle 的最有效方法是什么?
我需要能够处理整个数据集。
还要考虑到此 table 每天将更新约 40,000 行,因此保持视觉效果与数据同步是一个问题。
提前感谢您的建议!
~外星人
不确定如何在 R 中具体实现,但一般方法如下:
当您列出 table 中的实体时,单个请求最多返回 1000 个实体。如果 table 包含超过 1000 个实体,Table 服务将返回两个额外的 headers:x-ms-continuation-NextPartitionKey 和 x-ms-continuation-NextRowKey。这两个 headers 的存在表明还有更多数据可供您获取。
您需要做的是读取这两个 headers 的值,并在下一个请求的 URL 中指定两个查询参数:NextPartitionKey 和 NextRowKey。所以你的请求应该是这样的:
https://account.table.core.windows.net/Table?NextPartitionKey=<x-ms-continuation-NextPartitionKey header value>&NextRowKey=<x-ms-continuation-NextRowKey header value>
您需要重复该过程,直到您在响应中没有得到这些 headers。
您可以在此处了解更多信息:https://docs.microsoft.com/en-us/rest/api/storageservices/query-timeout-and-pagination。
感谢指点!我已经整理了一些代码...不幸的是,我没有通过第一个循环(检索第 3 页)而且我不太明白为什么。
我通过做出一些假设来编写它,例如返回的元数据始终具有相同的结构。
这是代码:
# First paging attempt. It reuses URL, header_string and xml_body from the
# single-request script, reads the response headers by scraping the text that
# capture.output() collects on the message stream, and tries to loop while
# continuation headers are present.
# NOTE(review): this block is the attempt reported as failing; the notes below
# mark the problems visible in the code itself.
library(httr)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)
# Retrieves metadata
# GET() is evaluated inside capture.output(type = "message"), so whatever the
# request writes to the message stream (presumably the verbose() log) ends up
# in Get_headers, one element per line.
Get_headers <- capture.output(
content(
GET(
URL,
config = header_string,
verbose()
)
),
type = "message")
# NOTE(review): the status line is assumed to be element 11 of the captured
# log. With %>% the captured line becomes grepl()'s first argument (pattern)
# and "HTTP/1.1 200 OK" becomes the text searched - confirm this is intended.
Server_response <- Get_headers[11] %>%
trimws( whitespace = "\r") %>%
trimws( whitespace = "<- ") %>%
grepl("HTTP/1.1 200 OK")
# Initializes variables
Pages <- 0
Next_headers_count <- 0
# Fetches data only if authentication was successful
# NOTE(review): `=` is assignment, not comparison, and R does not parse it
# inside if(); isTRUE(Server_response) was presumably intended.
if (Server_response = TRUE) {
Get_data <- xml_body # Gets data as text from API
From_JSON <-fromJSON(Get_data, flatten = TRUE) # Parses text from JSON
Table_name <- as.data.frame(From_JSON) # Saves data to a table
Pages <- Pages + 1 # One page of data has been retrieved
# Checks if there are more than 1000 rows to be fetched
# NOTE(review): the continuation headers are assumed to be elements 19 and 20
# of the captured log. "\.*" is not a valid escape in an R string literal
# ("\\.*" would be), and with %>% the captured line is passed as gsub()'s
# pattern while the header name is passed as x - confirm the intended
# argument order.
x_ms_continuation_NextPartitionKey <- Get_headers[19] %>%
trimws( whitespace = "<- ") %>%
gsub("\.*", "x-ms-continuation-NextPartitionKey") %>%
grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
x_ms_continuation_NextRowKey <- Get_headers[20] %>%
trimws( whitespace = "<- ") %>%
gsub("\.*", "x-ms-continuation-NextRowKey") %>%
grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
# Starts loop to retrieve additional data
# NOTE(review): same `= TRUE` problem as the if() above; for a scalar loop
# condition `&&` is the usual operator rather than `&`.
while (x_ms_continuation_NextPartitionKey = TRUE &
x_ms_continuation_NextRowKey = TRUE) {
Pages <- Pages + 1 # Counts the number of pages retrieved, including the initial page
Next_headers_count <- Next_headers_count +1 # Counts the number of Next headers passed by the metadata
# NOTE(review): both keys are read from Get_headers, which is never reassigned
# inside the loop (the refreshed log is stored in Get_new_headers), so every
# iteration rebuilds the same Next_URL.
Next_Partition_Key <- Get_headers[19] %>% # Extracts the value of the Next Partition Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
Next_Row_key <- Get_headers[20] %>% # Extracts the value of the Next Row Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
# The key values are pasted into the query string as-is (no URL encoding).
Next_URL <- paste0( # Creates the URL for the Next Authentication token
"https://",
Account,
".table.core.windows.net",
"/",
Container,
"?",
"NextPartitionKey=",
Next_Partition_Key,
"&NextRowKey=",
Next_Row_key
)
next_xml_body = content( # Retrieves next 1000 rows of content from table
GET(
Next_URL,
config = header_string,
verbose()
),
"text"
)
Get_new_data <- next_xml_body # Gets data as text from API
From_JSON <-fromJSON(Get_new_data, flatten = TRUE) # Parses text from JSON
Temp_table_name <- as.data.frame(From_JSON) # Saves data to a table
# The new page is placed before the rows collected so far.
Table_name <- bind_rows(Temp_table_name, Table_name) # Appends new data to the initial data
# Second request for the same Next_URL, issued only to capture its log.
Get_new_headers <- capture.output( # Retrieves new next headers
content(
GET(
Next_URL,
config = header_string,
verbose()
)
),
type = "message")
# NOTE(review): New_server_response is computed but not used afterwards.
New_server_response <- Get_new_headers[11] %>%
trimws( whitespace = "\r") %>%
trimws( whitespace = "<- ") %>%
grepl("HTTP/1.1 200 OK")
# Checks if there are more than 1000 rows to be fetched
New_x_ms_continuation_NextPartitionKey <- Get_new_headers[19] %>%
trimws( whitespace = "<- ") %>%
gsub("\.*", "x-ms-continuation-NextPartitionKey") %>%
grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
New_x_ms_continuation_NextRowKey <- Get_new_headers[20] %>%
trimws( whitespace = "<- ") %>%
gsub("\.*", "x-ms-continuation-NextRowKey") %>%
grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
# Carries the refreshed flags into the loop condition.
x_ms_continuation_NextPartitionKey <- New_x_ms_continuation_NextPartitionKey
x_ms_continuation_NextRowKey <- New_x_ms_continuation_NextRowKey
# NOTE(review): these two values are overwritten from Get_headers at the top
# of the next iteration, so the refreshed keys are never used.
Next_Partition_Key <- Get_new_headers[19] %>% # Extracts the value of the Next Partition Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
Next_Row_key <- Get_new_headers[20] %>% # Extracts the value of the Next Row Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
}
} else {print("authentication failed")}
# Previews table
Pages
Next_headers_count
View(Table_name)
有了这个,我只能检索 2000 个条目。当下一个循环开始时,它失败了。好像这里失败了:
# Excerpt of the statement reported as failing: it requests Next_URL again and
# stores in Get_new_headers whatever capture.output() collects on the message
# stream while GET() runs (presumably the verbose() log with the headers).
Get_new_headers <- capture.output( # Retrieves new next headers
content(
GET(
Next_URL,
config = header_string,
verbose()
)
),
type = "message")
这是错误:
如有任何帮助,我们将不胜感激!
我明白了...我改进了脚本,现在它可以运行了。 :-)
代码可以进一步完善,但这是一个工作脚本,可以在迭代中从 table 检索所有数据。
# Connects to an Azure Table based on the specifications for Shared Key: https://docs.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key
library(httr)
library(RCurl)
library(bitops)
library(xml2)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)

# Stores credentials in variables
Account <- "storage"
Container <- "Usage"
Key <- "key"

# Composes the base URL of the table resource
URL <- paste0("https://", Account, ".table.core.windows.net", "/", Container)

# Builds freshly dated Shared Key headers for a GET on the table.
# A new time stamp is signed on every call because Azure Storage rejects
# requests whose date is too old (15 minutes per the Shared Key documentation),
# and a long paging run could outlive a single signature.
build_headers <- function() {
  requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")
  signature_string <- paste0(
    "GET", "\n",                  # HTTP verb
    "\n",                         # Content-MD5 (left empty)
    "text/xml", "\n",             # Content-Type
    requestdate, "\n",            # Date
    "/", Account, "/", Container  # Canonicalized resource
  )
  signature <- RCurl::base64(
    digest::hmac(
      key = RCurl::base64Decode(Key, mode = "raw"),
      object = enc2utf8(signature_string),
      algo = "sha256",
      raw = TRUE
    )
  )
  add_headers(
    Authorization = paste0("SharedKey ", Account, ":", signature),
    "x-ms-date" = requestdate,
    "x-ms-version" = "2020-12-06",
    "Content-type" = "text/xml"
  )
}

# Initializes counters and the page accumulator
Pages <- 0
Next_headers_count <- 0
page_tables <- list()
continuation <- NULL # Query parameters for the next page; NULL for the first

repeat {
  # One request per page; httr URL-encodes the continuation tokens in `query`
  response <- GET(URL, config = build_headers(), query = continuation)

  Server_response <- status_code(response) == 200L
  if (!Server_response) {
    warning(
      "Request failed with HTTP status ", status_code(response),
      call. = FALSE
    )
    break
  }

  # Parses up to 1000 rows returned for this page
  Pages <- Pages + 1
  From_JSON <- fromJSON(
    content(response, "text", encoding = "UTF-8"),
    flatten = TRUE
  )
  page_tables[[Pages]] <- as.data.frame(From_JSON)

  # Reads the continuation tokens from the response headers (name lookup on
  # httr's header list is case-insensitive)
  response_headers <- headers(response)
  Next_Partition_Key <- response_headers[["x-ms-continuation-nextpartitionkey"]]
  Next_Row_key <- response_headers[["x-ms-continuation-nextrowkey"]]

  # No partition-key token means the last page has been retrieved
  if (is.null(Next_Partition_Key)) {
    break
  }

  Next_headers_count <- Next_headers_count + 1
  continuation <- list(NextPartitionKey = Next_Partition_Key)
  if (!is.null(Next_Row_key)) {
    continuation$NextRowKey <- Next_Row_key
  }
}

# Binds all pages once, in the order they were retrieved
Table_name <- bind_rows(page_tables)

Pages
Next_headers_count
View(Table_name)