从 Power Pivot 中提取 2000 万行 ("Item.data")

Rip 20 million rows from Power Pivot ("Item.data")

我收到了一个工作簿,其 Power Pivot 数据模型中包含两个表(一个大约一百万行,另一个约 2000 万行)。我想把这些数据提取出来(任何格式都可以,这里就以 CSV 为例),以便在 R + PostgreSQL 中使用。

我无法导出到 Excel 表格,因为数据超过 100 万行;复制粘贴也只有在每次选中大约 200,000 行时才有效。
我尝试把 xlsx 的扩展名改为 zip,并在 Notepad++ 中打开其中的“item.data”文件,但它的内容是加密的。

我拼凑了一段 VBA 代码,它对大约 50 万行(0.5 million)的数据有效:

' Entry point: exports one table of the active workbook's Data Model to CSV.
'
' Initializes the workbook's Data Model, fetches the model's ADO connection and
' passes it to WriteDmvContent together with the name of the table to export.
' Any run-time error raised along the way is shown in a message box.
Public Sub CreatePowerPivotDmvInventory()
    Dim conn As ADODB.Connection
    Dim wbTarget As Workbook
    On Error GoTo FailureOutput

    Set wbTarget = ActiveWorkbook
    wbTarget.Model.Initialize

    Set conn = wbTarget.Model.DataModelConnection.ModelConnection.ADOConnection

    ' Pass the name of the model table to export (it is used in an EVALUATE query)
    ' E.g. Partners
    WriteDmvContent "Partners", conn

    MsgBox "Finished"
    Exit Sub

FailureOutput:
    MsgBox Err.Description
End Sub
 
' Runs the query "EVALUATE <dmvName>" on the given connection and writes the
' result to H:\output_table_<dmvName>.csv using the Write # statement: first a
' line with the column names, then one line per record.
'
' Parameters:
'   dmvName - name of the model table to query; also part of the output file name.
'   conn    - ADO connection to query (the caller passes the Data Model connection).
'
' Errors: on any run-time error the output file and the recordset are closed,
' then the original error is re-raised so the caller's handler can report it.
Private Sub WriteDmvContent(ByVal dmvName As String, ByRef conn As ADODB.Connection)
    Dim rs As ADODB.Recordset
    Dim mdx As String
    Dim myFile As String
    Dim fileNum As Integer
    Dim fileIsOpen As Boolean
    Dim i As Long
    Dim lastField As Long
    Dim errNumber As Long
    Dim errSource As String
    Dim errDescription As String

    On Error GoTo FailureOutput

    mdx = "EVALUATE " & dmvName

    Set rs = New ADODB.Recordset
    rs.ActiveConnection = conn
    rs.Open mdx, conn, adOpenForwardOnly, adLockOptimistic

    ' Setup CSV file; FreeFile avoids clashing with a file number already in use
    myFile = "H:\output_table_" & dmvName & ".csv"
    fileNum = FreeFile
    Open myFile For Output As #fileNum
    fileIsOpen = True

    ' The column count does not change while reading, so look it up once
    lastField = rs.Fields.Count - 1

    ' Output column names (a trailing comma keeps the output on the same line)
    For i = 0 To lastField
        If i = lastField Then
            Write #fileNum, rs.Fields(i).Name
        Else
            Write #fileNum, rs.Fields(i).Name,
        End If
    Next i

    ' Output of the query results
    Do Until rs.EOF
        For i = 0 To lastField
            If i = lastField Then
                Write #fileNum, rs.Fields(i).Value
            Else
                Write #fileNum, rs.Fields(i).Value,
            End If
        Next i
        rs.MoveNext
    Loop

    Close #fileNum
    fileIsOpen = False
    rs.Close
    Set rs = Nothing

    Exit Sub

FailureOutput:
    ' Remember the error first: the cleanup below must not lose it
    errNumber = Err.Number
    errSource = Err.Source
    errDescription = Err.Description

    ' Release the file handle and the recordset before leaving
    If fileIsOpen Then Close #fileNum
    If Not rs Is Nothing Then
        If rs.State <> adStateClosed Then rs.Close
    End If
    Set rs = Nothing

    ' Hand the original error back to the caller
    Err.Raise errNumber, errSource, errDescription
End Sub

DAX Studio 将允许您查询 Excel 工作簿中的数据模型并输出为各种格式,包括平面文件。

您需要的查询是:

-- Query template: replace <table name> with the name of the model table to export.
EVALUATE
<table name>

我找到了一个可行的 (VBA) 解决方案 [不过 greggy 的建议对我也有效!]:我的表太大,无法作为一个整块导出,所以我按 'month' 循环并逐月筛选。这种做法看起来可行;把所有文件拼接在一起后,得到了一个 1.2 GB 的 CSV:

' Encodes a date as a number of the form YYYYMM (year * 100 + month),
' e.g. 1 December 2000 -> 200012.
Function YYYYMM(aDate As Date)
    ' Kept as Variants on purpose: the arithmetic below is then done on
    ' Variants exactly as if Year()/Month() were used inline.
    Dim yearPart As Variant
    Dim monthPart As Variant

    yearPart = Year(aDate)
    monthPart = Month(aDate)

    YYYYMM = yearPart * 100 + monthPart
End Function

' Returns the YYYYMM value of the month following the given YYYYMM value.
' December (value ending in 12) rolls over to January of the next year;
' every other value is simply incremented by one.
Function NextYYYYMM(YYYYMM As Long)
    Dim increment As Long

    Select Case YYYYMM Mod 100
        Case 12
            ' +100 moves to the next year, -11 moves month 12 back to month 01
            increment = 100 - 11
        Case Else
            increment = 1
    End Select

    NextYYYYMM = YYYYMM + increment
End Function

' Entry point: initializes the active workbook's Data Model, obtains its ADO
' connection and asks WriteDmvContent to export the table "table1".
' Shows "Finished" on success, or the error description on failure.
Public Sub CreatePowerPivotDmvInventory()
    Dim wbTarget As Workbook
    Dim conn As ADODB.Connection
    Dim tblname As String

    On Error GoTo FailureOutput

    ' Name of the model table that will be exported
    tblname = "table1"

    Set wbTarget = ActiveWorkbook
    With wbTarget.Model
        .Initialize
        Set conn = .DataModelConnection.ModelConnection.ADOConnection
    End With

    Call WriteDmvContent(tblname, conn)

    MsgBox "Finished"
    Exit Sub

FailureOutput:
    MsgBox Err.Description
End Sub

' Exports a model table month by month, one CSV file per month.
'
' For every year-month from December 2000 through December 2015 the sub runs
'   EVALUATE(CALCULATETABLE(<dmvName>, <dmvName>[yearmonth] = "YYYY-MM"))
' and writes the result to H:\vba_tbl_<dmvName>_<YYYY-MM>.csv: a line with the
' column names followed by one line per record. Each name/value is wrapped in
' explicit quote characters before it is passed to the Write # statement.
'
' Parameters:
'   dmvName - name of the model table to query; also part of the file names.
'   conn    - ADO connection to query (the caller passes the Data Model connection).
'
' Errors: on any run-time error the current output file and recordset are
' closed, then the original error is re-raised so the caller's handler can
' report it.
Private Sub WriteDmvContent(ByVal dmvName As String, ByRef conn As ADODB.Connection)
    Dim rs As ADODB.Recordset
    Dim mdx As String
    Dim myFile As String
    Dim fileNum As Integer
    Dim fileIsOpen As Boolean
    Dim i As Long
    Dim lastField As Long
    Dim eval_field As String
    Dim eval_val As String
    Dim CurrYM As Long, LimYM As Long
    Dim errNumber As Long
    Dim errSource As String
    Dim errDescription As String

    On Error GoTo FailureOutput

    'If table small enough:
    'mdx = "EVALUATE " & dmvName

    'Other-wise filter on this column, one year-month at a time:
    eval_field = "yearmonth"

    'Loop through year_month
    CurrYM = YYYYMM(#12/1/2000#)
    LimYM = YYYYMM(#12/1/2015#)
    Do While CurrYM <= LimYM

        ' 200012 -> "2000-12"
        eval_val = Left$(CStr(CurrYM), 4) & "-" & Right$(CStr(CurrYM), 2)
        Debug.Print eval_val

        mdx = "EVALUATE(CALCULATETABLE(" & dmvName & ", " & dmvName & "[" & eval_field & "] = """ & eval_val & """))"
        Debug.Print mdx

        Set rs = New ADODB.Recordset
        rs.ActiveConnection = conn
        rs.Open mdx, conn, adOpenForwardOnly, adLockOptimistic

        ' Setup CSV file; FreeFile avoids clashing with a file number already in use
        myFile = "H:\vba_tbl_" & dmvName & "_" & eval_val & ".csv"
        Debug.Print myFile
        fileNum = FreeFile
        Open myFile For Output As #fileNum
        fileIsOpen = True

        ' The column count does not change while reading, so look it up once
        lastField = rs.Fields.Count - 1

        ' Output column names (a trailing comma keeps the output on the same line)
        For i = 0 To lastField
            If i = lastField Then
                Write #fileNum, """" & rs.Fields(i).Name & """"
            Else
                Write #fileNum, """" & rs.Fields(i).Name & """",
            End If
        Next i

        ' Output of the query results
        Do Until rs.EOF
            For i = 0 To lastField
                If i = lastField Then
                    Write #fileNum, """" & rs.Fields(i) & """"
                Else
                    Write #fileNum, """" & rs.Fields(i) & """",
                End If
            Next i
            rs.MoveNext
        Loop

        Close #fileNum
        fileIsOpen = False
        rs.Close
        Set rs = Nothing

        CurrYM = NextYYYYMM(CurrYM)
    Loop

    Exit Sub

FailureOutput:
    ' Remember the error first: the cleanup below must not lose it
    errNumber = Err.Number
    errSource = Err.Source
    errDescription = Err.Description

    ' Release the file handle and the recordset before leaving
    If fileIsOpen Then Close #fileNum
    If Not rs Is Nothing Then
        If rs.State <> adStateClosed Then rs.Close
    End If
    Set rs = Nothing

    ' Hand the original error back to the caller
    Err.Raise errNumber, errSource, errDescription
End Sub

我修改了 mptevsion 的脚本 - 现在它把表中的数据每 n 行保存为一个单独的 csv 文件(默认 10 万行,可以通过修改 chunk_size 来调整)。此脚本的优点是不依赖表中的任何字段来拆分数据;为此它使用了 TOPNSKIP (https://dax.guide/topnskip/)。

' Exports a Data Model table to a series of CSV files of at most chunk_size
' rows each, without relying on any column of the table to split the data.
'
' Steps:
'   1. Initialize the active workbook's Data Model and get its ADO connection.
'   2. Query COUNTROWS of the table to learn how many rows must be exported.
'   3. Repeatedly query TOPNSKIP(<rows in chunk>, <rows already exported>, table)
'      and write each result to <save_path>\vba_tbl_<tblName>_<chunk_id>.csv
'      (a line with the column names, then one line per record).
'
' Settings (edit in the body): save_path, tblName, chunk_size.
' Shows "Finished" on success; on any run-time error the open file and
' recordset are closed and the error description is shown instead.
Public Sub CreatePowerPivotDmvInventory()
    Dim save_path As String
    Dim tblName As String
    Dim chunk_size As Long
    Dim chunk_id As Long
    Dim rows_in_chunk As Long

    Dim rs As ADODB.Recordset
    Dim conn As ADODB.Connection
    Dim mdx As String
    Dim myFile As String
    Dim fileNum As Integer
    Dim fileIsOpen As Boolean
    Dim i As Long
    Dim lastField As Long

    Dim rows_limit As Long
    Dim rows_left As Long
    Dim errDescription As String

    On Error GoTo FailureOutput

    ActiveWorkbook.Model.Initialize

    save_path = "H:\power pivot\csv"
    tblName = "data"
    chunk_size = 100000

    Set conn = ActiveWorkbook.Model.DataModelConnection.ModelConnection.ADOConnection

    ' calculating number of rows in a table
    mdx = "evaluate {COUNTROWS('" & tblName & "')}"
    Set rs = New ADODB.Recordset
    rs.ActiveConnection = conn
    rs.Open mdx, conn, adOpenForwardOnly, adLockOptimistic
    ' A blank count arrives as Null, which cannot be stored in a Long
    If IsNull(rs.Fields(0).Value) Then
        rows_limit = 0
    Else
        rows_limit = rs.Fields(0).Value
    End If
    ' Close the counting recordset before the variable is reused below
    rs.Close
    Set rs = Nothing

    rows_left = rows_limit
    chunk_id = 1

    Do While rows_left > 0
        ' The last chunk may be smaller than chunk_size
        rows_in_chunk = chunk_size
        If rows_left < rows_in_chunk Then
            rows_in_chunk = rows_left
        End If

        ' rows_limit - rows_left = number of rows already exported (rows to skip)
        mdx = "define var data_table = '" & tblName & "'" & Chr(10) & _
            "EVALUATE(" & Chr(10) & _
            "    TOPNSKIP(" & rows_in_chunk & ", " & rows_limit - rows_left & ", data_table)" & Chr(10) & _
            ");"
        Debug.Print mdx

        Set rs = New ADODB.Recordset
        rs.ActiveConnection = conn
        rs.Open mdx, conn, adOpenForwardOnly, adLockOptimistic

        ' Setup CSV file; FreeFile avoids clashing with a file number already in use
        myFile = save_path & "\vba_tbl_" & tblName & "_" & chunk_id & ".csv"
        Debug.Print myFile
        fileNum = FreeFile
        Open myFile For Output As #fileNum
        fileIsOpen = True

        ' The column count does not change while reading, so look it up once
        lastField = rs.Fields.Count - 1

        ' Output column names (a trailing comma keeps the output on the same line)
        For i = 0 To lastField
            If i = lastField Then
                Write #fileNum, """" & rs.Fields(i).Name & """"
            Else
                Write #fileNum, """" & rs.Fields(i).Name & """",
            End If
        Next i

        ' Output of the query results
        Do Until rs.EOF
            For i = 0 To lastField
                If i = lastField Then
                    Write #fileNum, """" & rs.Fields(i) & """"
                Else
                    Write #fileNum, """" & rs.Fields(i) & """",
                End If
            Next i
            rs.MoveNext
        Loop

        Close #fileNum
        fileIsOpen = False
        rs.Close
        Set rs = Nothing

        rows_left = rows_left - rows_in_chunk
        chunk_id = chunk_id + 1
    Loop

    MsgBox "Finished"
    Exit Sub

FailureOutput:
    ' Remember the message first: the cleanup below must not lose it
    errDescription = Err.Description

    ' Release the file handle and the recordset before reporting
    If fileIsOpen Then Close #fileNum
    If Not rs Is Nothing Then
        If rs.State <> adStateClosed Then rs.Close
    End If
    Set rs = Nothing

    MsgBox errDescription
End Sub