在 R 中从 REST API 检索数据时如何解决分页问题

How to resolve paging when retrieving data from REST API in R

我正在使用 REST API 使用以下代码从 Azure Table 检索数据:

library(httr)
library(RCurl)
library(bitops)
library(xml2)

# Stores credentials in variable

Account <- "storageaccount"
Container <- "Usage"
Key <- "key"


# Composes URL

URL <- paste0(
          "https://", 
          Account, 
          ".table.core.windows.net", 
          "/", 
          Container
        )

# Requests time stamp

requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")


# As per Microsoft's specs, an empty line is needed for content-length

content_lenght <- 0

# Composes signature string

signature_string <- paste0(
                     "GET", "\n",                 # HTTP Verb
                     "\n",                        # Content-MD-5
                     "text/xml", "\n",            # Content-Type
                     requestdate, "\n",           # Date
                     "/", Account, "/", Container # Canonicalized resource
                    )

# Composes header string

header_string <- add_headers(
                    Authorization=paste0(
                      "SharedKey ", 
                      Account, 
                      ":",
                      RCurl::base64(
                        digest::hmac(
                          key = RCurl::base64Decode(
                            Key, mode = "raw"
                          ),
                          object = enc2utf8(signature_string),
                          algo = "sha256", 
                          raw = TRUE
                            )
                        )
                    ),
                    'x-ms-date' = requestdate, 
                    'x-ms-version' = "2020-12-06",
                    'Content-type' = "text/xml"
                  )

# Creates request

xml_body = content(
                GET(
                  URL, 
                  config = header_string, 
                  verbose()
                  ),
                "text"
                )

Get_data <- xml_body                             # Gets data as text from API  
From_JSON <-fromJSON(Get_data, flatten = TRUE)   # Parses text from JSON
Table_name <- as.data.frame(From_JSON)           # Saves data to a table

我现在可以查看 table,但我注意到我只能看到前 1000 行。实现检索所有剩余行并更新 table 的 loop/cycle 的最有效方法是什么?

我需要能够处理整个数据集。

还要考虑到此 table 每天将更新约 40,000 行,因此保持视觉效果与数据同步是一个问题。

提前感谢您的建议!

~外星人

不确定如何在 R 中具体实施,但这是一般方法:

当您列出 table 中的实体时,单个请求中最多 return 编辑 1000 个实体。如果 table 包含超过 1000 个实体,Table 服务将 return 两个额外的 headers:x-ms-continuation-NextPartitionKeyx-ms-continuation-NextRowKey。这两个 headers 的存在表明有更多数据可供您获取。

您需要做的是使用这些 headers 并在下一个请求 URL 中指定两个查询参数:NextPartitionKeyNextRowKey。所以你的请求应该是这样的:

https://account.table.core.windows.net/Table?NextPartitionKey=<x-ms-continuation-NextPartitionKey header value>&NextRowKey=<x-ms-continuation-NextRowKey header value>.

您需要重复该过程,直到您在响应中没有得到这些 headers。

您可以在此处了解更多信息:https://docs.microsoft.com/en-us/rest/api/storageservices/query-timeout-and-pagination

感谢指点!我已经整理了一些代码...不幸的是,我没有通过第一个循环(检索第 3 页)而且我不太明白为什么。

我通过做出一些假设来编写它,例如返回的元数据始终具有相同的结构。

这是代码:

library(httr)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)


# Retrieves metadata
Get_headers <- capture.output(
                 content(
                   GET(
                   URL, 
                   config = header_string, 
                   verbose()
                  )
                 ),
              type = "message")

Server_response <- Get_headers[11] %>%
                    trimws( whitespace = "\r") %>% 
                     trimws( whitespace = "<- ") %>%
                      grepl("HTTP/1.1 200 OK")


# Initializes variables
Pages <- 0
Next_headers_count <- 0

# Fetches data only if authentication was successful

if (Server_response = TRUE) {
   
   Get_data <- xml_body                             # Gets data as text from API  
   From_JSON <-fromJSON(Get_data, flatten = TRUE)   # Parses text from JSON
   Table_name <- as.data.frame(From_JSON)           # Saves data to a table
   Pages <- Pages + 1                               # One page of data has been retrieved
   
   # Checks if there are more than 1000 rows to be fetched
   
   x_ms_continuation_NextPartitionKey <- Get_headers[19] %>%                                              
                                         trimws( whitespace = "<- ") %>%
                                            gsub("\.*", "x-ms-continuation-NextPartitionKey") %>%
                                              grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)

   x_ms_continuation_NextRowKey <- Get_headers[20] %>%
                                  trimws( whitespace = "<- ") %>% 
                                    gsub("\.*", "x-ms-continuation-NextRowKey") %>%
                                       grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
   
   # Starts loop to retrieve additional data
   
   while (x_ms_continuation_NextPartitionKey = TRUE & 
          x_ms_continuation_NextRowKey = TRUE) {
             
             Pages <- Pages + 1                                      # Counts the number of pages retrieved, including the initial page
             Next_headers_count <- Next_headers_count +1             # Counts the number of Next headers passed by the metadata
             
             Next_Partition_Key <- Get_headers[19] %>%               # Extracts the value of the Next Partition Key
                                     str_remove(".+(?= )") %>%       
                                       trimws( whitespace =" ") %>%
                                         trimws( whitespace = "\r")

             Next_Row_key <- Get_headers[20] %>%                     # Extracts the value of the Next Row Key
                                str_remove(".+(?= )") %>% 
                                  trimws( whitespace =" ") %>%
                                    trimws( whitespace = "\r")
   
             Next_URL <- paste0(                                     # Creates the URL for the Next Authentication token
                       "https://", 
                        Account, 
                        ".table.core.windows.net", 
                        "/", 
                        Container, 
                         "?", 
                       "NextPartitionKey=", 
                        Next_Partition_Key, 
                        "&NextRowKey=",
                        Next_Row_key
                    )
    
              next_xml_body = content(                            # Retrieves next 1000 rows of content from table
                                  GET(
                                    Next_URL, 
                                    config = header_string, 
                                    verbose()
                                   ),
                                  "text"
                                )
             
             Get_new_data <- next_xml_body                             # Gets data as text from API 
             From_JSON <-fromJSON(Get_new_data, flatten = TRUE)        # Parses text from JSON
             Temp_table_name <- as.data.frame(From_JSON)               # Saves data to a table
             Table_name <- bind_rows(Temp_table_name, Table_name)      # Appends new data to the initial data

             Get_new_headers <- capture.output(                        # Retrieves new next headers
                                 content(
                                   GET(
                                   Next_URL, 
                                   config = header_string, 
                                   verbose()
                                    )
                                   ),
                                 type = "message")
             
             New_server_response <- Get_new_headers[11] %>%
                                     trimws( whitespace = "\r") %>% 
                                      trimws( whitespace = "<- ") %>%
                                         grepl("HTTP/1.1 200 OK")
             
             # Checks if there are more than 1000 rows to be fetched
   
             New_x_ms_continuation_NextPartitionKey <- Get_new_headers[19] %>%                                              
                                                      trimws( whitespace = "<- ") %>%
                                                        gsub("\.*", "x-ms-continuation-NextPartitionKey") %>%
                                                          grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)

             New_x_ms_continuation_NextRowKey <- Get_new_headers[20] %>%
                                                trimws( whitespace = "<- ") %>% 
                                                  gsub("\.*", "x-ms-continuation-NextRowKey") %>%
                                                    grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
             
             x_ms_continuation_NextPartitionKey <- New_x_ms_continuation_NextPartitionKey
             x_ms_continuation_NextRowKey <- New_x_ms_continuation_NextRowKey
             
             Next_Partition_Key <- Get_new_headers[19] %>%               # Extracts the value of the Next Partition Key
                                     str_remove(".+(?= )") %>%       
                                       trimws( whitespace =" ") %>%
                                         trimws( whitespace = "\r")

             Next_Row_key <- Get_new_headers[20] %>%                     # Extracts the value of the Next Row Key
                                str_remove(".+(?= )") %>% 
                                  trimws( whitespace =" ") %>%
                                    trimws( whitespace = "\r")
             
   } 
   
} else {print("authentication failed")}

# Previews table
Pages
Next_headers_count
View(Table_name)

有了这个,我只能检索 2000 个条目。当下一个循环开始时,它失败了。好像这里失败了:

Get_new_headers <- capture.output(                        # Retrieves new next headers
                                 content(
                                   GET(
                                   Next_URL, 
                                   config = header_string, 
                                   verbose()
                                    )
                                   ),
                                 type = "message")

这是错误:

如有任何帮助,我们将不胜感激!

我明白了...我改进了脚本,现在它可以运行了。 :-)

代码可以进一步完善,但这是一个工作脚本,可以在迭代中从 table 检索所有数据。

Connects to an Azure Table based on the specifications for Shared Key: https://docs.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key


library(httr)
library(RCurl)
library(bitops)
library(xml2)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)

# Stores credentials in variable

Account <- "storage"
Container <- "Usage"
Key <- "key"


# Composes URL

URL <- paste0(
          "https://", 
          Account, 
          ".table.core.windows.net", 
          "/", 
          Container
        )

# Requests time stamp

requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")


# As per Microsoft's specs, an empty line is needed for content-length

content_lenght <- 0

# Composes signature string

signature_string <- paste0(
                     "GET", "\n",                 # HTTP Verb
                     "\n",                        # Content-MD-5
                     "text/xml", "\n",            # Content-Type
                     requestdate, "\n",           # Date
                     "/", Account, "/", Container # Canonicalized resource
                    )

# Composes header string

header_string <- add_headers(
                    Authorization=paste0(
                      "SharedKey ", 
                      Account, 
                      ":",
                      RCurl::base64(
                        digest::hmac(
                          key = RCurl::base64Decode(
                            Key, mode = "raw"
                          ),
                          object = enc2utf8(signature_string),
                          algo = "sha256", 
                          raw = TRUE
                            )
                        )
                    ),
                    'x-ms-date' = requestdate, 
                    'x-ms-version' = "2020-12-06",
                    'Content-type' = "text/xml"
                  )


# Calls


Get_headers <- capture.output(                                          # Retrieves metadata
                  content(
                    GET(
                      URL, 
                      config = header_string,
                      verbose()
                    )
                  ), 
                 type = "message"
               )       

Server_response <- Get_headers[11] %>%                                    # Retrieves server response
                         trimws( whitespace = "\r") %>% 
                            trimws( whitespace = "<- ") %>%
                               grepl("HTTP/1.1 200 OK")

Get_headers
Server_response

# Initializes counters

Pages <- 0
Next_headers_count <- 0

  while(isTRUE(Server_response)) {
    
        Pages <- Pages + 1
    
        xml_body <- content(                                       # Retrieves up to 1000 rows from the table
                        GET(
                          URL, 
                          config = header_string, 
                           verbose()
                         ), 
                        "text"
                      ) 
        
        Get_data <- xml_body                                      # Gets data as text from API  
        From_JSON <-fromJSON(Get_data, flatten = TRUE)            # Parses text from JSON
        Temp_table_name <- as.data.frame(From_JSON)               # Saves current rows to temp table
        Table_name <- bind_rows(Temp_table_name, Table_name)      # Appends new data to the initial data
    
        # Checks if there are more than 1000 rows to be fetched
    
        x_ms_continuation_NextPartitionKey <- Get_headers[19] %>%                                              
                                                trimws( whitespace = "<- ") %>%
                                                  gsub("\.*", "x-ms-continuation-NextPartitionKey") %>%
                                                    grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
   
        x_ms_continuation_NextRowKey <- Get_headers[20] %>%
                                          trimws( whitespace = "<- ") %>% 
                                             gsub("\.*", "x-ms-continuation-NextRowKey") %>%
                                               grepl("x-ms-continuation-NextRowKey", fixed = TRUE) 
        
        x_ms_continuation_NextPartitionKey
        x_ms_continuation_NextRowKey
        
          if (isTRUE(x_ms_continuation_NextPartitionKey) & 
                 isTRUE(x_ms_continuation_NextRowKey)) {
          
                 Next_headers_count <- Next_headers_count + 1
          
                 Next_Partition_Key <- Get_headers[19] %>%                    # Extracts the value of the Next Partition Key
                                          str_remove(".+(?= )") %>%       
                                            trimws( whitespace =" ") %>%
                                              trimws( whitespace = "\r")
             
                 Next_Row_key <- Get_headers[20] %>%                          # Extracts the value of the Next Row Key
                                     str_remove(".+(?= )") %>% 
                                        trimws( whitespace =" ") %>%
                                           trimws( whitespace = "\r")
          
                 URL <- paste0(                                               # Creates the URL for the Next Authentication token
                         "https://", 
                          Account, 
                         ".table.core.windows.net", 
                         "/", 
                         Container, 
                         "?", 
                        "NextPartitionKey=", 
                         Next_Partition_Key, 
                         "&NextRowKey=",
                         Next_Row_key
                        )
           
               
                  Get_headers <- capture.output(                                          # Retrieves new metadata
                                       content(
                                             GET(
                                               URL, 
                                               config = header_string,
                                               verbose()
                                             )
                                          ), 
                                        type = "message"
                                   )       

                   Server_response <- Get_headers[11] %>%                                    # Retrieves new server response
                                          trimws( whitespace = "\r") %>% 
                                             trimws( whitespace = "<- ") %>%
                                                 grepl("HTTP/1.1 200 OK")
          }
        
  }

Pages
Next_headers_count
View(Table_name)