
Distance & cluster with dynamic time warping

我正在使用 dtw 计算若干时间序列之间的距离,却得到了奇怪的结果。请注意,在下面的示例数据中,前 9 个客户由三组完全相同的序列组成(A==B==C、D==E==F 和 G==H==I)。其余各行只是噪声,用来让我能够划分出 8 个簇。

我预计这前几组序列会与各自完全相同的同伴聚到同一个簇中。在原始数据上计算距离时确实如此,但如果在计算距离和聚类之前先对数据进行缩放,得到的结果就不一样了。

原始数据中相同行之间的距离为 0.0(如预期),但对于缩放数据,距离不是 0.0(甚至不接近)。知道为什么它们不一样吗?


   # Example data: one row per customer (cust) and ten period columns P1..P10.
   # The header line names 11 columns while every data line has 12 fields, so
   # read.table() uses the leading 1..36 field as row names instead of a column.
   mydata = as_data_frame(read.table(textConnection("
cust   P1   P2   P3   P4  P5   P6   P7   P8   P9  P10
1     A  1.1  1.0  1.0  1.0 1.0  1.0  1.0  1.0  1.0  1.0
2     B  1.1  1.0  1.0  1.0 1.0  1.0  1.0  1.0  1.0  1.0
3     C  1.1  1.0  1.0  1.0 1.0  1.0  1.0  1.0  1.0  1.0
4     D  0.0  1.0  2.0  1.0 0.0  1.0  2.0  1.0  0.0  1.0
5     E  0.0  1.0  2.0  1.0 0.0  1.0  2.0  1.0  0.0  1.0
6     F  0.0  1.0  2.0  1.0 0.0  1.0  2.0  1.0  0.0  1.0
7     G  2.0  1.5  1.0  0.5 0.0  0.5  1.0  1.5  2.0  1.5
8     H  2.0  1.5  1.0  0.5 0.0  0.5  1.0  1.5  2.0  1.5
9     I  2.0  1.5  1.0  0.5 0.0  0.5  1.0  1.5  2.0  1.5
10   D2  1.0  2.0  1.0  0.0 1.0  2.0  1.0  0.0  1.0  2.0
11   E2  5.0  6.0  5.0  4.0 5.0  6.0  5.0  4.0  5.0  6.0
12   F2  9.0 10.0  9.0  8.0 9.0 10.0  9.0  8.0  9.0 10.0
13   G2  1.5  1.0  0.5  0.0 0.5  1.0  1.5  2.0  1.5  1.0
14   H2  5.5  5.0  4.5  4.0 4.5  5.0  5.5  6.0  5.5  5.0
15   I2  9.5  9.0  8.5  8.0 8.5  9.0  9.5 10.0  9.5  9.0
16   A3  1.0  1.0  0.0  2.0 1.0  1.0  1.0  1.0  1.0  1.0
17   B3  5.0  5.0  5.0  5.0 5.0  3.0  8.0  5.0  5.0  5.0
18   C3  9.0  9.0  9.0  9.0 9.0  5.4 14.4  9.0  9.0  9.0
19   D3  0.0  1.0  2.0  1.0 0.0  1.0  1.0  2.0  0.0  1.0
20   E3  4.0  5.0  5.0  6.0 4.0  5.0  6.0  5.0  4.0  5.0
21   F3  8.0  9.0 10.0  9.0 9.0  9.0  9.0  9.0  8.0  9.0
22   G3  2.0  1.5  1.0  0.5 0.0  0.5  1.0  2.0  1.5  1.5
23   H3  6.0  5.5  5.0  4.5 4.0  5.0  4.5  5.5  6.0  5.5
24   I3 10.0  9.5  9.0  9.0 8.0  8.5  9.0  9.5 10.0  9.5
25   D4  0.0  3.0  6.0  3.0 0.0  3.0  6.0  3.0  0.0  5.0
26   E4  3.0  6.0  9.0  6.0 3.0  6.0  9.0  6.0  3.0  6.0
27   F4  4.0  6.0 10.0  7.0 5.0  6.0 11.0  8.0  5.0  7.0
28   D5  5.0  0.0  3.0  6.0 3.0  0.0  3.0  6.0  3.0  0.0
29   D6  9.0  6.0  3.0  6.0 9.0  6.0  3.0  6.0  9.0  6.0
30   D7  9.0 11.0  5.0  4.0 6.0 10.0  7.0  5.0  6.0 11.0
31   Dw  0.0  0.8  1.4  2.0 1.0  0.0  2.0  0.0  1.0  2.0
32   Ew  4.0  4.8  5.4  6.0 5.0  4.0  6.0  4.0  5.0  6.0
33   Fw  8.0  8.8  9.4 10.0 9.0  8.0 10.0  8.0  9.0 10.0
34   Gw  2.0  1.5  1.0  0.5 0.0  1.0  2.0  1.5  1.3  1.1
35   Hw  6.0  5.5  5.0  4.5 4.0  5.0  6.0  5.5  5.3  5.1
36   Iw 10.0  9.5  9.0  8.5 8.0  9.0 10.0  9.5  9.3  9.1"),
                           header = TRUE, stringsAsFactors = FALSE))

# Create a scaled version of mydata: for every customer (row),
# sc = (raw value - row mean) / row standard deviation.
mydata_long = mydata %>%
  mutate(mean = apply(mydata[, 2:ncol(mydata)], 1, mean, na.rm = TRUE)) %>%
  mutate(sd = apply(mydata[, 2:ncol(mydata)], 1, sd, na.rm = TRUE)) %>%
  gather(period, value, -cust, -mean, -sd) %>%
  mutate(sc = (value - mean) / sd)

# Keep only cust, period and sc (drop columns 2, 3 and 5: mean, sd, value)
# and reshape back to wide form, one row per customer. The period columns
# are listed explicitly so they stay in P1..P10 order.
# NOTE(review): after this reshape the rows are ordered by cust, not in the
# original order of mydata, so row i here is not row i of mydata.
mydata_sc = mydata_long[, -c(2, 3, 5)] %>%
  spread(period, sc) %>%
  select(cust, P1, P2, P3, P4, P5, P6, P7, P8, P9, P10)

# NOTE(review): k (the number of clusters) is not set anywhere in this
# snippet; it must be defined before the cutree() calls below.

# dtw (raw data)
dtw_dist = TSDatabaseDistances(mydata[2:ncol(mydata)],
                               distance = "dtw", lag.max = 2) # distance
dtw_clus = hclust(dtw_dist, method = "ward.D2") # cluster
dtw_res = data.frame(cutree(dtw_clus, k)) # cut dendrogram into k clusters

# dtw (scaled data)
dtw_sc_dist = TSDatabaseDistances(mydata_sc[2:ncol(mydata_sc)],
                                  distance = "dtw", lag.max = 2) # distance
dtw_sc_clus = hclust(dtw_sc_dist, method = "ward.D2") # cluster
dtw_sc_res = data.frame(cutree(dtw_sc_clus, k)) # cut dendrogram into k clusters

# Side-by-side cluster labels: raw vs. scaled (matched by row position).
results = cbind(dtw_res, dtw_sc_res)
names(results) = c("dtw", "dtw_scaled")


   dtw dtw_scaled
1    1          1
2    1          2
3    1          1
4    1          2
5    1          1
6    1          2
7    1          3
8    1          4
9    1          3
10   1          3
11   2          3
12   3          4
13   1          5
14   2          6
15   3          3
16   1          4
17   2          3
18   4          3
19   1          6
20   2          3
21   3          4
22   1          3
23   2          3
24   3          6
25   5          7
26   6          8
27   7          7
28   5          7
29   6          7
30   8          8
31   1          7
32   2          7
33   3          7
34   1          8
35   2          7
36   3          7


  1. 您按行缩放,而不是按列缩放(看看您的 dplyr 链的中间结果——它们有意义吗?)

  2. 您用于生成缩放数据的数据操作将数据框的行顺序更改为按字母顺序排列:

    > mydata_sc %>% head
         cust          P1          P2          P3          P4          P5         P6         P7          P8          P9         P10
      (chr)       (dbl)       (dbl)       (dbl)       (dbl)       (dbl)      (dbl)      (dbl)       (dbl)       (dbl)       (dbl)
      1     A  2.84604989 -0.31622777 -0.31622777 -0.31622777 -0.31622777 -0.3162278 -0.3162278 -0.31622777 -0.31622777 -0.31622777
      2    A3  0.00000000  0.00000000 -2.12132034  2.12132034  0.00000000  0.0000000  0.0000000  0.00000000  0.00000000  0.00000000
      3     B  2.84604989 -0.31622777 -0.31622777 -0.31622777 -0.31622777 -0.3162278 -0.3162278 -0.31622777 -0.31622777 -0.31622777


      > mydata %>% head
      Source: local data frame [6 x 11]
             cust    P1    P2    P3    P4    P5    P6    P7    P8    P9   P10
      (chr) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
      1     A   1.1     1     1     1     1     1     1     1     1     1
      2     B   1.1     1     1     1     1     1     1     1     1     1

(检查 cust 变量排序!)


  1. 使用内置的 scale 函数进行缩放:

    # Standardise the numeric columns with scale(), then put cust back in
    # front; the row order of mydata is left untouched.
    mydata_sc <- mydata %>%
      select(-cust) %>%
      scale %>%
      as.data.frame %>%
      cbind(cust = mydata$cust, .) %>%
      as.tbl
  2. 断言您的缩放数据框等同于原始数据框的缩放版本:

    > # Summed difference should be ~0 if mydata_sc matches scale(mydata).
    > (scale(mydata_sc %>% select(-cust)) - scale(mydata %>% select(-cust))) %>%
        colSums %>% sum
    [1] 0.000000000000005353357 
  3. 创建一个函数来执行您想要的操作:

    # Cluster the series held in df: column 1 is skipped, the remaining
    # columns are passed to TSDatabaseDistances() with the dtw distance, the
    # result is clustered with Ward's method and the tree is cut into k groups.
    # Returns a one-column data frame of cluster labels, one per row of df.
    # NOTE(review): k is read from the calling environment; it is not an
    # argument of this function and must be defined before the call.
    return_dtw <- function(df) {
      res_2 = TSDatabaseDistances(df[2:ncol(df)], distance = "dtw", lag.max = 2) %>%
        hclust(., method = "ward.D2")
      data.frame(cutree(res_2, k))
    }
  4. 执行函数:

      > mydata %>% return_dtw %>% cbind(mydata_sc %>% return_dtw)
      cutree.res_2..k. cutree.res_2..k.
      1                 1                1
      2                 1                1
      3                 1                1
      4                 1                1
      5                 1                1
      6                 1                1
      7                 1                1
      8                 1                1
      9                 1                1
      10                1                1
      11                2                2
      12                3                3
      13                1                1
      14                2                2
      15                3                3
      16                1                1
      17                2                2
      18                4                3
      19                1                1
      20                2                2
      21                3                3
      22                1                1
      23                2                2
      24                3                3
      25                5                4
      26                6                5
      27                7                5
      28                5                6
      29                6                7
      30                8                8
      31                1                1
      32                2                2
      33                3                3
      34                1                1
      35                2                2
      36                3                3
