自动化使用 MS Word 作为 PDF 到 HTML 的转换器

Automating the use of MS Word as a PDF-to-HTML converter

我的任务是将数百个 PDF 文档转换为 HTML 格式。我尝试了多个独立转换器,但它们在处理列和连字符对齐方面存在问题。

但是,如果我只是在 MS Word 中打开文档,它会完美地处理它们。

所以,基本上,我需要一种方法来在 Word 中自动打开 PDF,等待它处理并将其另存为 HTML(过滤)。

有人知道该怎么做吗?

我们可以在 PowerShell 中完成。

将您的要求分成两部分

第一步 - 将 PDF 转换为 Word 文档(VBScript 脚本)

'**** The script runs in a loop until it detects a new file in the directory
'**** It checks the source folder every 30 seconds.  To change this interval
'**** change the sleep time at the end of the program to the number of seconds x 1000
'****  wscript.sleep (15000) would check the folder every 15 seconds
'****  
'****
'**** The program uses AnyBizSoft PDtoWord converter.  It is available as a free version off of
'***  facebook here - Facebook  - http://www.facebook.com/AnyBizSoft?v=app_6009294086
'***  You can buy it from their website at http://www.anypdftools.com/pdf-to-word.html
'***  I have no connection with them other than that they were the first one I found that worked with
'***  a command line.
'***
'***  The script uses two directories.  C:\Source\ is where pdf files are copied to
'***  C:\Converted is where the converted file is placed.  It is either a doc file if you have Office 2003 or
'***  older or a docx if you have Office 2007 or newer.
'***  After the file is converted the original pdf is deleted. This can be changed by commenting out the
'***  Line that deletes the file near the end of the script.
'***
'***  The script can be placed anywhere, but the pdftoword folder needs to be copied from the program files 
'***  directory to the c:\source folder
'***
'Option Explicit
'*** Setup: file system object, working folders, WMI connection and the
'*** event subscription that reports new files appearing in c:\source.
Set objFSO = CreateObject("Scripting.FileSystemObject")
Dim strComputer
strComputer = "."      '*** "." addresses the local computer
spath="C:\source\"     '*** Source directory
dpath="C:\converted\"  '*** Destination or Converted Directory

'*** Connect to the root\cimv2 namespace.  The moniker needs the
'*** "\\<computer>\root\cimv2" form, i.e. TWO backslashes before the computer name.
Set objWMIService = GetObject("winmgmts:" _
    & "{impersonationLevel=impersonate}!\\" & _
        strComputer & "\root\cimv2")

'*** Subscribe to file-creation events in c:\source, polled by WMI every 10 seconds.
'*** The folder name sits inside an object path that is itself inside a WQL string
'*** literal, so each backslash must be escaped twice: c:\source -> c:\\\\source.
Set colMonitoredEvents = objWMIService.ExecNotificationQuery _
    ("SELECT * FROM __InstanceCreationEvent WITHIN 10 WHERE " _
        & "Targetinstance ISA 'CIM_DirectoryContainsFile' and " _
            & "TargetInstance.GroupComponent= " _
                & "'Win32_Directory.Name=""c:\\\\source""'")
'*** Shell object used to launch the converter; one instance serves every iteration.
Set wshShell = WScript.CreateObject("WScript.Shell")

'*** Main loop: wait for a PDF in the source folder, convert it, move the result
'*** to the destination folder, clean up, then pause before looking again.
Do

  '*** Poll the source folder until it contains a PDF.  Only *.pdf files are
  '*** considered, so stray or leftover files are never handed to the converter.
  '*** The one-second sleep keeps this loop from spinning at full CPU.
  sourcefile = ""
  Do
    Set sourcefolder = objFSO.GetFolder(spath)
    For Each objFile In sourcefolder.Files
      If LCase(objFSO.GetExtensionName(objFile.Name)) = "pdf" Then
        sourcefile = objFile.Name
        Exit For
      End If
    Next
    If sourcefile = "" Then WScript.Sleep 1000
  Loop Until sourcefile <> ""

  '*** Call pdftoword to convert the file (window style 6 = minimized,
  '*** False = do not wait for the program to exit).
  convertstr = "c:\source\pdftoword\pdftoword.exe " & Chr(34) & spath & sourcefile & Chr(34)
  wshShell.Run convertstr, 6, False

  '*** Wait for the doc/docx to be created before continuing.  The created file's
  '*** extension is compared (case-insensitively) instead of searching for "doc"
  '*** anywhere in the path, so a source such as "document.pdf" cannot end the wait.
  Do
    Set objLatestEvent = colMonitoredEvents.NextEvent
    createdpath = LCase(Replace(objLatestEvent.TargetInstance.PartComponent, Chr(34), ""))
  Loop Until (Right(createdpath, 4) = ".doc" Or Right(createdpath, 5) = ".docx")

  '*** Make time stamp (hhmmss) for the file name
  d = Now
  hhmmss = Right("00" & Hour(d), 2) & Right("00" & Minute(d), 2) & Right("00" & Second(d), 2)

  '*** Get just the filename without the extension.  GetBaseName strips only the
  '*** final extension, so names containing several dots are handled correctly.
  sourcefilename = objFSO.GetBaseName(sourcefile)

  '*** Add the timestamp to the converted file
  newname = sourcefilename & "-" & hhmmss

  '*** Exit program if file exists in the destination folder.  Highly unlikely since it is timestamped
  If objFSO.FileExists(dpath & newname & ".docx") Then
    WScript.Echo "Destination file " & dpath & newname & ".docx exists already"
    WScript.Quit
  End If
  If objFSO.FileExists(dpath & newname & ".doc") Then
    WScript.Echo "Destination file " & dpath & newname & ".doc exists already"
    WScript.Quit
  End If

  '*** Work out which converted file was produced (docx preferred over doc).
  If objFSO.FileExists(spath & sourcefilename & ".docx") Then
    convertedext = ".docx"
  ElseIf objFSO.FileExists(spath & sourcefilename & ".doc") Then
    convertedext = ".doc"
  Else
    convertedext = ""
  End If

  '*** Move converted file to the converted folder then delete original.
  '*** Move and delete each happen exactly once per source file.
  If convertedext <> "" Then
    objFSO.MoveFile spath & sourcefilename & convertedext, dpath & newname & convertedext
    objFSO.DeleteFile spath & sourcefile   'Delete or comment this line if you do not want the original deleted
  End If

  '*** Kill PDFtoword process
  strProcessKill = "PDFtoWord.exe"
  Set colProcess = objWMIService.ExecQuery _
    ("Select * from Win32_Process Where Name = '" & strProcessKill & "'")
  For Each objProcess In colProcess
    objProcess.Terminate()
  Next

  WScript.Sleep 30000  'Wait 30 seconds to look for next file.  1000 = 1 second
Loop

第二步 - 将 Word 文档转换为 HTML(PowerShell 脚本)

# Converts the Word documents found in -docpath to filtered HTML.
#   -docpath   folder that contains the Word documents
#   -htmlpath  folder that receives the HTML files (defaults to -docpath)
param([string]$docpath,[string]$htmlpath = $docpath)

# NOTE(review): "*.doc" also returning .docx files relies on the file system
# provider's legacy wildcard matching - confirm on the target PowerShell version.
$srcfiles = Get-ChildItem $docPath -filter "*.doc"

# Start Word first: the interop enum type below can only be resolved once the
# Word interop assembly is loaded, which may not be the case before this point.
$word = new-object -comobject word.application
$word.Visible = $False

# Resolve wdFormatFilteredHTML. If the interop type is not available, fall back
# to its documented numeric value (10) so the script still works.
$wdSaveFormatType = 'Microsoft.Office.Interop.Word.WdSaveFormat' -as [type]
if ($wdSaveFormatType)
    {
        $saveFormat = [Enum]::Parse($wdSaveFormatType, "wdFormatFilteredHTML");
    }
else
    {
        $saveFormat = 10
    }

# Opens the file held in the script-level $doc with the shared $word instance
# and saves it as filtered HTML into $htmlpath as "<base name>.html".
# Reads $doc, $word, $htmlpath and $saveFormat from the calling scope.
function saveas-filteredhtml
    {
        # Build the target path explicitly. Inside a double-quoted string
        # "$doc.fullname" expands only $doc and appends the literal text
        # ".fullname", and no path separator would be inserted after $htmlpath.
        $target = Join-Path $htmlpath ($doc.BaseName + ".html")

        $opendoc = $word.documents.open($doc.FullName);
        if ($null -eq $opendoc)
            {
                Write-Warning "Could not open $($doc.FullName)"
                return
            }
        try
            {
                $opendoc.saveas([ref]$target, [ref]$saveFormat);
            }
        finally
            {
                # Close the document even when saving fails so it does not stay open in Word.
                $opendoc.close();
            }
    }

# Convert every document that was found. The try/finally guarantees that the
# hidden Word instance is shut down even if the run is interrupted or a
# terminating error occurs; otherwise a WINWORD.EXE process would be left behind.
try
    {
        ForEach ($doc in $srcfiles)
            {
                Write-Host "Processing :" $doc.FullName
                # saveas-filteredhtml reads the current $doc from this scope
                saveas-filteredhtml
            }
    }
finally
    {
        $word.quit();
    }

将此代码保存为 convertdoc-tohtml.ps1,即可对一组 Word 文档运行它,无论扩展名是 doc 还是 docx。

运行方法如下:

convertdoc-tohtml.ps1 -docpath "C:\Documents" -htmlpath "C:\Output"