如何在 C# 中通过 OpenXML 从 Word (.docx) 文档中提取 OLE 文件

How to extract OLE file from Word(.Docx) by OpenXML through C#

我想使用 OpenXML 从 ".docx" 文件中提取 "OLE package"。我不知道该怎么做,而且在官方示例中也没有找到任何相关示例。请帮助我。

这是我的尝试:

  1. 我用 "MS Office 2016" 创建了一个名为 "Test.docx" 的 Docx 文件,并在其中插入了一个 ".zip" 文件。然后我用 "Open XML SDK 2.5 Productivity Tool" 打开 "Test.docx" 进行查看,找到了这个(Figure 1),但是我没有从反射出的代码(Reflect Code)中得到任何关于如何提取这个 zip 文件的信息。

  2. 然后我尝试使用 C# 和 SharpCompress.dll 来提取这个 ".zip" 文件,代码如下:

     /// <summary>
     /// Console sample that copies every embedded object part of a Word document to disk.
     /// </summary>
     class Program
     {
         static void Main(string[] args)
         {
             string filepath = @"C:\Users\宇宙无敌帅小伙\Desktop\test.docx";

             OleFileTest(filepath);
         }

         /// <summary>
         /// Writes the raw content of each embedded object part found in the main document part
         /// to a numbered ".zip" file on the desktop (0.zip, 1.zip, ...).
         /// </summary>
         /// <param name="filepath">Path of the .docx file to read.</param>
         public static void OleFileTest(string filepath)
         {
             try
             {
                 // The document is only read, so it is opened as non-editable.
                 using (WordprocessingDocument Docx = WordprocessingDocument.Open(filepath, false))
                 {
                     IEnumerable<EmbeddedObjectPart> embd1 = Docx.MainDocumentPart.EmbeddedObjectParts;

                     int cnt = 0;
                     foreach (EmbeddedObjectPart item in embd1)
                     {
                         // Verbatim interpolated string: backslashes are literal, {cnt} is substituted.
                         using (System.IO.Stream dt = item.GetStream(FileMode.Open, FileAccess.Read))
                         using (FileStream fs = File.Open($@"C:\Users\宇宙无敌帅小伙\Desktop\{cnt}.zip", FileMode.Create, FileAccess.ReadWrite, FileShare.ReadWrite))
                         {
                             // Copy the part's bytes; writing a freshly allocated buffer would
                             // only produce a file full of zeros.
                             dt.CopyTo(fs);
                         }
                         cnt++;
                     }
                 }
             }
             catch (Exception e)
             {
                 // Sample code: report the failure on the console instead of crashing.
                 Console.WriteLine(e.Message);
             }
         }
     }
    

但是我无法打开我提取的这个 ".zip" 文件。 有人可以帮我吗?非常感谢!

难点在于,您从 EmbeddedObjectPart 中提取出的二进制文件并不是您的 ZIP 文件,而是一个包含该 ZIP 文件的结构化存储文件(structured storage file)。

以下单元测试展示了如何提取通过 Microsoft Word 作为 OLE 对象嵌入到 Word 文档 ("Resources\ZipContainer.docx") 中的 ZIP 文件(例如 ZipContents.zip)。请注意 Ole10Native.ExtractFile() 方法的用法,该方法从嵌入在 Word 文档中的结构化存储文件(例如 oleObject1.bin)中提取出 ZIP 文件。

using System.IO;
using CodeSnippets.Windows;
using DocumentFormat.OpenXml.Packaging;
using Xunit;

namespace CodeSnippets.Tests.OpenXml.Wordprocessing
{
    /// <summary>
    /// Tests demonstrating how to extract a file that was embedded into a Word document
    /// as an OLE object.
    /// </summary>
    public class EmbeddedObjectPartTests
    {
        /// <summary>
        /// Saves the given embedded object part as a temporary structured storage file,
        /// extracts the embedded file from it into <paramref name="destinationFolderPath"/>,
        /// and removes the temporary file again.
        /// </summary>
        /// <param name="part">The embedded object part to extract.</param>
        /// <param name="destinationFolderPath">Folder that receives the extracted file.</param>
        private static void ExtractFile(EmbeddedObjectPart part, string destinationFolderPath)
        {
            // Determine the file name and destination path of the binary,
            // structured storage file.
            string binaryFileName = Path.GetFileName(part.Uri.ToString());
            string binaryFilePath = Path.Combine(destinationFolderPath, binaryFileName);

            // Ensure the destination directory exists.
            Directory.CreateDirectory(destinationFolderPath);

            try
            {
                // Copy part contents to structured storage file.
                using (Stream partStream = part.GetStream())
                using (FileStream fileStream = File.Create(binaryFilePath))
                {
                    partStream.CopyTo(fileStream);
                }

                // Extract the embedded file from the structured storage file.
                Ole10Native.ExtractFile(binaryFilePath, destinationFolderPath);
            }
            finally
            {
                // Remove the structured storage file even when extraction fails.
                File.Delete(binaryFilePath);
            }
        }

        [Fact]
        public void CanExtractEmbeddedZipFile()
        {
            // Verbatim string so the backslash is a path separator, not an escape sequence.
            const string documentPath = @"Resources\ZipContainer.docx";
            const string destinationFolderPath = "Output";
            string destinationFilePath = Path.Combine(destinationFolderPath, "ZipContents.zip");

            // Remove output left over from a previous run so the assertion below
            // can only succeed if this run actually extracted the file.
            if (File.Exists(destinationFilePath))
            {
                File.Delete(destinationFilePath);
            }

            using WordprocessingDocument wordDocument =
                WordprocessingDocument.Open(documentPath, false);

            // Extract all embedded objects.
            foreach (EmbeddedObjectPart part in wordDocument.MainDocumentPart.EmbeddedObjectParts)
            {
                ExtractFile(part, destinationFolderPath);
            }

            Assert.True(File.Exists(destinationFilePath));
        }
    }
}

以下是 Ole10Native 类的核心代码,它曾由 Microsoft 发布,但现在已不太容易找到(其中用到的 StgOpenStorage、IStorage、IEnumSTATSTG、STGM 等互操作声明未在此处列出):

using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Runtime.InteropServices.ComTypes;
using System.Text.RegularExpressions;

namespace CodeSnippets.Windows
{
    /// <summary>
    /// Extracts the file stored in the "Ole10Native" stream of a structured storage file
    /// (for example an oleObject*.bin part taken from a Word document).
    /// </summary>
    public class Ole10Native
    {
        // Origin values passed to IStream.Seek: 0 = from the start, 1 = from the current position.
        private const int SeekFromStart = 0;
        private const int SeekFromCurrent = 1;

        // Maximum number of bytes read for each null-terminated string in the stream.
        private const int MaxPathLength = 260;

        /// <summary>
        /// Opens the structured storage file and writes the file embedded in it
        /// to <paramref name="destinationFolder"/>.
        /// </summary>
        /// <param name="sourceFilePath">Path of the structured storage file.</param>
        /// <param name="destinationFolder">Existing folder that receives the extracted file.</param>
        /// <exception cref="IOException">The storage could not be opened.</exception>
        public static void ExtractFile(string sourceFilePath, string destinationFolder)
        {
            StgOpenStorage(sourceFilePath, null, STGM.READWRITE | STGM.SHARE_EXCLUSIVE, IntPtr.Zero, 0, out IStorage iStorage);

            // NOTE(review): StgOpenStorage is declared outside this listing; if it reports
            // failure through its return value rather than an exception, the storage is null.
            if (iStorage == null)
            {
                throw new IOException($"Could not open '{sourceFilePath}' as a structured storage file.");
            }

            try
            {
                ProcessPackage(iStorage, destinationFolder);
            }
            finally
            {
                Marshal.ReleaseComObject(iStorage);
            }
        }

        /// <summary>
        /// Enumerates the elements of the storage and extracts the embedded file from every
        /// element whose name contains "Ole10Native".
        /// </summary>
        private static void ProcessPackage(IStorage pStg, string destinationFolder)
        {
            uint numReturned;
            pStg.EnumElements(0, IntPtr.Zero, 0, out IEnumSTATSTG pEnumStatStg);

            try
            {
                var ss = new STATSTG[1];

                // Loop through the STATSTG structures in the storage until none is returned.
                while (true)
                {
                    pEnumStatStg.Next(1, ss, out numReturned);
                    if (numReturned == 0)
                    {
                        break;
                    }

                    // Only the "Ole10Native" stream contains the actual embedded object.
                    string elementName = ss[0].pwcsName;
                    if (elementName == null || !elementName.Contains("Ole10Native"))
                    {
                        continue;
                    }

                    pStg.OpenStream(elementName, IntPtr.Zero, (uint) STGM.READ | (uint) STGM.SHARE_EXCLUSIVE, 0,
                        out IStream pStream);

                    try
                    {
                        ExtractEmbeddedFile(pStream, destinationFolder);
                    }
                    finally
                    {
                        Marshal.ReleaseComObject(pStream);
                    }
                }
            }
            finally
            {
                Marshal.ReleaseComObject(pEnumStatStg);
            }
        }

        /// <summary>
        /// Parses an "Ole10Native" stream and writes the file data it contains to
        /// <paramref name="destinationFolder"/>, named after the embedded object's file name.
        /// </summary>
        private static void ExtractEmbeddedFile(IStream stream, string destinationFolder)
        {
            // The file name of the embedded object starts at the 7th byte.
            stream.Seek(6, SeekFromStart, IntPtr.Zero);
            string embeddedName = ReadNullTerminatedString(stream);

            // Next is the null-terminated source path of the embedded object; it is not needed.
            ReadNullTerminatedString(stream);

            // Unknown 4 bytes.
            stream.Seek(4, SeekFromCurrent, IntPtr.Zero);

            // Length of the temporary file path, followed by the path itself, which is skipped.
            // (Office copies the file to a temporary location before inserting it.)
            uint temporaryPathLength = ReadUInt32LittleEndian(stream);
            stream.Seek(temporaryPathLength, SeekFromCurrent, IntPtr.Zero);

            // Size of the actual file data, followed by the data.
            uint dataLength = ReadUInt32LittleEndian(stream);
            if (dataLength > int.MaxValue)
            {
                throw new InvalidDataException("The embedded object is too large to extract.");
            }

            var data = new byte[dataLength];
            stream.Read(data, data.Length, IntPtr.Zero);

            File.WriteAllBytes(Path.Combine(destinationFolder, GetFileName(embeddedName)), data);
        }

        /// <summary>
        /// Reads bytes up to and including a zero byte (at most <see cref="MaxPathLength"/> bytes)
        /// and returns them as a string, mapping each byte to the character with the same code.
        /// </summary>
        private static string ReadNullTerminatedString(IStream stream)
        {
            var singleByte = new byte[1];
            var characters = new char[MaxPathLength];
            int length;

            for (length = 0; length < MaxPathLength; length++)
            {
                stream.Read(singleByte, 1, IntPtr.Zero);
                if (singleByte[0] == 0)
                {
                    break;
                }

                characters[length] = (char) singleByte[0];
            }

            return new string(characters, 0, length);
        }

        /// <summary>
        /// Reads a 4-byte little-endian unsigned integer from the stream.
        /// </summary>
        private static uint ReadUInt32LittleEndian(IStream stream)
        {
            var buffer = new byte[4];
            stream.Read(buffer, buffer.Length, IntPtr.Zero);

            // Widen each byte to uint before shifting so a high byte >= 0x80 is not
            // sign-extended into a huge value.
            return buffer[0]
                   | ((uint) buffer[1] << 8)
                   | ((uint) buffer[2] << 16)
                   | ((uint) buffer[3] << 24);
        }

        /// <summary>
        /// Returns the part of <paramref name="filePath"/> after the last path separator,
        /// or the whole string when it contains no separator.
        /// </summary>
        private static string GetFileName(string filePath)
        {
            // Both separators are stripped so the name read from the stream cannot
            // point outside the destination folder.
            int lastSeparator = filePath.LastIndexOfAny(new[] { '\\', '/' });
            return filePath.Substring(lastSeparator + 1);
        }
    }
}

您可以在我的 CodeSnippets GitHub 存储库中找到完整的源代码(包括 Ole10Native class)。