将交替的日期和价格列匹配到单个 table 中以供分析

Question

我一直在尝试正确清理和格式化原始商品价格数据以进行（时间序列）分析，我很好奇你们中的任何专业人士会如何处理这种设置。每两列代表一个日期列表和一个价格列表。这些日期（不幸的是）独立于同一行中的任何其他日期（尽管偶然可能有很多相同的日期）。

我的策略是创建一个新的数据框，其中每一行代表一天、每一列代表一种商品的价格，然后运行一个循环，把每个商品的日期匹配到正确的行，并填入对应的价格。

但是，我认为我执行此操作的效率可能很低，而且我的在线搜索没有提供此过程的其他示例。

请在下面找到示例代码。

    # Sample data: ten (DateN, ItemN) column pairs stored side by side.
    # Dates are month/day/year strings and each pair carries its own dates
    # (Date3 starts and ends later than the others); Date8/Item8 are entirely NA.
    df <- structure(list(Date1 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item1 = c(650L, 650L, 635L, 640L, 640L, 625L, 620L, 580L, 550L, 520L, 530L), Date2 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item2 = c(590L, 590L, 590L, 580L, 580L, 580L, 580L, 580L, 460L, 460L, 395L), Date3 = c("12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012", "1/23/2012", "1/30/2012", "2/6/2012", "2/13/2012", "2/20/2012"), Item3 = c(775L, 775L, 775L, 750L, 750L, 750L, 750L, 750L, 725L, 725L, 740L), Date4 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item4 = c(660L, 700L, 700L, 700L, 700L, 700L, 650L, 650L, 650L, 650L, 610L), Date5 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item5 = c(705L, 705L, 705L, 650L, 650L, 650L, 650L, 555L, 555L, 555L, 555L), Date6 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item6 = c(612L, 612L, 612L, 612L, 612L, 612L, 612L, 612L, 612L, 612L, 612L), Date7 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item7 = c(630L, 630L, 625L, 635L, 625L, 615L, 620L, 560L, 550L, 540L, 530L), Date8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Item8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Date9 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item9 = c(622L, 622L, 650L, 650L, 650L, 660L, 660L, 660L, 665L, 
665L, 665L), Date10 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item10 = c(1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L)), .Names = c("Date1", "Item1", "Date2", "Item2", "Date3", "Item3", "Date4", "Item4", "Date5", "Item5", "Date6", "Item6", "Date7", "Item7", "Date8", "Item8", "Date9", "Item9", "Date10", "Item10"), row.names = 95:105, class = "data.frame")
    df         # print the raw data
    class(df)  # prints the object's class ("data.frame", as set in structure() above)
    # Visual inspection of the data shows the first and last dates are
    # 10/31/2011 and 2/20/2012

    # Convert month/day/four-digit-year text (e.g. "10/31/2011") to Date values.
    mdyyyy <- function(x) {
      as.Date(x, "%m/%d/%Y")
    }

    # Calendar of every day spanned by the data. The range is computed from the
    # date columns (the odd-numbered columns of `df`) rather than typed in by
    # hand, so prices dated outside a hard-coded window cannot be silently lost
    # when they are later matched against `days`.
    alldates <- do.call(c, unname(lapply(df[seq(1, ncol(df), by = 2)], mdyyyy)))
    if (all(is.na(alldates))) {
      stop("no parsable dates found in the date columns", call. = FALSE)
    }
    daterange <- range(alldates, na.rm = TRUE)

    days <- seq(from = daterange[1], # first date in the data
                to   = daterange[2], # last date in the data
                by   = "day")

    head(days)

    # Columns alternate Date, Item, Date, Item, ...
    datecolumns  <- seq(from = 1, to = ncol(df), by = 2) # odd positions: dates
    pricecolumns <- seq(from = 2, to = ncol(df), by = 2) # even positions: prices

    # Empty result matrix: one row per calendar day, one column per price series.
    # The width is length(pricecolumns) because ncol(df[, pricecolumns]) is NULL
    # when there is only one price column (the subset drops to a plain vector).
    newdat <- matrix(NA,
                     nrow = length(days),
                     ncol = length(pricecolumns))

    # Label each row with its day, formatted as mm/dd/YYYY
    rownames(newdat) <- format(days, "%m/%d/%Y")

    # Preview: each row is a new day. Show at most the first 10 columns so the
    # preview also works when there are fewer than 10 price series.
    head(newdat[, seq_len(min(10, ncol(newdat))), drop = FALSE])

    # Place each series' prices into the rows of `newdat` that match its dates.
    for (i in seq_along(datecolumns)) {
      pricedates   <- mdyyyy(df[[datecolumns[i]]]) # this series' price dates
      rowlocations <- match(pricedates, days)      # row of each date in `newdat`
      # Keep only dates that were found in `days`: missing dates (and dates
      # outside the calendar) give NA, and a vectorised matrix assignment does
      # not allow NA subscripts.
      found <- !is.na(rowlocations)
      # One assignment per column; if a date repeats, the last price wins,
      # exactly as it would when assigning element by element.
      newdat[rowlocations[found], i] <- df[[pricecolumns[i]]][found]
    }
    # names(df)[...] also works for a single price column, where
    # colnames(df[, pricecolumns]) would be NULL.
    colnames(newdat) <- names(df)[pricecolumns]
    head(newdat)

之后，我打算借助 xts 包把它转换成可以用 apply.monthly() 和 rollapply() 进行分析的对象，因为原始数据的规模要大得多。

非常感谢您的想法和批评。

Answer 1

这是一种使用数组（矩阵）索引的方法；据我所知，这是用值填充矩阵最高效的方式：

## convert data to long format: one row per (series, date, price) observation
long <- reshape(df,
                varying       = list(datecolumns, pricecolumns),
                v.names       = c("Date", "Item"),
                new.row.names = seq_len(nrow(df) * length(datecolumns)),
                ## label each observation with its real price-column name, so Id
                ## always matches the column names of `out` below (not only when
                ## the columns happen to be called Item1, Item2, ...)
                times         = names(df)[pricecolumns],
                timevar       = "Id",
                direction     = "long")
long <- long[c("Id", "Date", "Item")]   # drop the row id that reshape() adds
long$Date <- mdyyyy(long$Date)          # parse the m/d/Y strings into Dates

long <- na.omit(long)                   # remove NAs

## create empty matrix: one row per day, one column per price series
out <- matrix(NA, length(days), length(pricecolumns),
              dimnames = list(as.character(days), names(df)[pricecolumns]))

## fill it with values from long; a two-column character matrix addresses
## each cell by (row name, column name)
out[cbind(as.character(long$Date), long$Id)] <- long$Item

Answer 2

不完全确定这是否是您想要的，但这里有一种方法：使用 dplyr 和 tidyr 包把您的数据结构转换为长格式，其中 Date 和 Item（我假设是价格）各占一列。无论您接下来想做什么，使用这种格式都会更容易一些。注意，df 是问题中提供的数据框。

library(tidyr)
library(dplyr)

# Number the source rows so each value can be traced back after reshaping
d <- mutate(df, row = seq_len(n()))
# Stack every Date*/Item* column into (key, value) pairs
d <- gather(d, key, value, -row)
# Split names such as "Item10" into the variable ("Item") and its number ("10")
d <- extract(d, key, c("var", "ref"), "(Date|Item)([0-9]*)")
# Put Date and Item back side by side: one record per (row, ref) pair
d <- spread(d, var, value)

head(d)
#>   row ref       Date Item
#> 1   1   1 10/31/2011  650
#> 2   1  10 10/31/2011 1040
#> 3   1   2 10/31/2011  590
#> 4   1   3  12/5/2011  775
#> 5   1   4 10/31/2011  660
#> 6   1   5 10/31/2011  705

此外，这个方法基于之前一个帖子的回答：Gather multiple sets of columns

如果你想把它展开成类似表格的结构，代码与上面相同，只是多了几行：

# Same long-format steps as before ...
d <- mutate(df, row = seq_len(n()))
d <- gather(d, key, value, -row)
d <- extract(d, key, c("var", "ref"), "(Date|Item)([0-9]*)")
d <- spread(d, var, value)
# ... then turn each reference number back into a column named "Item<ref>"
d <- mutate(d, ref = paste0("Item", ref))
d <- spread(d, ref, Item)
# `row` is still present while spreading, so each output line is one
# (row, Date) combination; it is dropped only afterwards.
d <- select(d, -row)

head(d)
#>         Date Item1 Item10 Item2 Item3 Item4 Item5 Item6 Item7 Item8 Item9
#> 1 10/31/2011   650   1040   590  <NA>   660   705   612   630  <NA>   622
#> 2  12/5/2011  <NA>   <NA>  <NA>   775  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
#> 3       <NA>  <NA>   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
#> 4  11/7/2011   650   1040   590  <NA>   700   705   612   630  <NA>   622
#> 5 12/12/2011  <NA>   <NA>  <NA>   775  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
#> 6       <NA>  <NA>   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>

将交替的日期和价格列匹配到单个 table 中以供分析

Matching alternating date and price columns into a single table for analysis

r

date

matching

data-cleaning