epub3:如何首先在存档中添加 mimetype
epub3 : how to add the mimetype at first in archive
我正在编写一个从 html 文件创建 epub 的脚本,但是当我检查生成的 epub 时出现以下错误:Mimetype entry missing or not the first in archive
存在 Mimetype,但它不是 epub 中的第一个文件。知道如何在任何情况下使用 Python 将它放在首位吗?
抱歉,我现在没有时间给出详细的解释,但这是我不久前写的一个(相对)简单的 epub 处理程序,它展示了如何做到这一点。
epubpad.py
#! /usr/bin/env python
''' Pad the ends of paragraph lines in an epub file with a single space char
Written by PM 2Ring 2013.05.12
'''
import sys, re, zipfile
def bold(s):
    """Return *s* wrapped in the ANSI escape codes that switch bold on and off."""
    template = "\x1b[1m%s\x1b[0m"
    return template % s
def report(attr, val):
    """Print a labelled value on one line.

    The label is *attr* followed by a colon, rendered through ``bold``;
    *val* is printed after it, wrapped in single quotes.
    """
    # A single pre-formatted argument produces the same output whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print("%s '%s'" % (bold(attr + ':'), val))
def fixepub(oldname, newname):
    """Copy the epub *oldname* to *newname*, padding paragraph lines.

    Every archive member whose name looks like an (x)html file, and does not
    contain 'wrap', has a single space appended to each non-empty line that
    does not already end in '>' or '}'.  All other members are copied
    unchanged.

    The 'mimetype' entry is always written first and uncompressed, because
    epub checkers reject archives where it is missing or not the first
    entry.  Its content is taken from the old archive when present,
    otherwise the standard 'application/epub+zip' is used.

    The content-file directory, creator and title are printed for
    information only; a missing creator or title is skipped.

    *newname* is opened for writing and truncated, so it must not name the
    same file as *oldname*.

    Raises KeyError if 'META-INF/container.xml' (or the content file it
    names) is not in the archive, and ValueError if the container has no
    full-path attribute.
    """
    def as_text(raw):
        # Archive data is bytes on Python 3 but already str on Python 2.
        return raw if isinstance(raw, str) else raw.decode('utf-8', 'replace')

    def first_group(pattern, data):
        # Group 1 of the first match of *pattern* in *data*, or None.
        found = re.search(pattern, data)
        return as_text(found.group(1)) if found else None

    container = 'META-INF/container.xml'
    # Names of all xhtml & html text files
    htmp = re.compile(r'\.[x]?htm[l]?')
    # Pattern for end of lines that don't need padding
    eolp = re.compile(br'[>}]$')

    # The context managers close both archives even if processing fails.
    with zipfile.ZipFile(oldname, 'r') as oldz:
        nlist = oldz.namelist()
        first = nlist[0] if nlist else ''
        if first != 'mimetype':
            print("%s First file is '%s', not 'mimetype'"
                  % (bold('Warning!!!'), first))

        # Get the name of the contents file from the container.
        contents = first_group(br'full-path="(.*?)"', oldz.read(container))
        if contents is None:
            raise ValueError("No full-path attribute found in '%s'" % container)

        # find() returns -1 when there is no separator, so the slice is then
        # empty; otherwise it keeps everything up to and including the '/'.
        dirname = contents[:contents.find('/') + 1]
        report("dirname", dirname)

        opf = oldz.read(contents)
        for label, pattern in (
            ("Creator", br'<dc:creator.*>(.*)</dc:creator>'),
            ("Title", br'<dc:title>(.*)</dc:title>'),
        ):
            value = first_group(pattern, opf)
            if value is not None:
                report(label, value)

        # A set keeps the per-member membership test cheap.
        htmnames = set(
            name for name in nlist
            if htmp.search(name) and 'wrap' not in name
        )

        if 'mimetype' in nlist:
            mimetype = oldz.read('mimetype')
        else:
            mimetype = b'application/epub+zip'

        with zipfile.ZipFile(newname, 'w', zipfile.ZIP_DEFLATED) as newz:
            # Written before anything else, straight from memory and
            # uncompressed; no temporary file is left on disk.
            newz.writestr('mimetype', mimetype, zipfile.ZIP_STORED)
            print('mimetype  * stored')

            for fname in nlist:
                if fname == 'mimetype':
                    continue  # already written as the first entry
                s = oldz.read(fname)
                status = fname
                if fname in htmnames:
                    status += '  * text'
                    # Pad lines that are (hopefully) inside paragraphs...
                    s = b'\n'.join(
                        line if not line or eolp.search(line) else line + b' '
                        for line in s.splitlines()
                    )
                print(status)
                newz.writestr(fname, s)
def main():
    """Command-line entry point: ``epubpad.py OLDNAME [NEWNAME]``.

    OLDNAME is the epub to read.  When NEWNAME is omitted, it is derived by
    inserting '_P' before the extension of OLDNAME's file name and replacing
    spaces in that file name with underscores; the directory part is kept
    as given.  The derived name therefore always differs from OLDNAME, so
    the input is never opened for writing.

    Prints a message and exits with status 1 when no file name is given.
    """
    import os  # local import: only the name derivation below needs it

    args = sys.argv[1:]
    oldname = args[0] if args else ''
    if not oldname:
        print('No filename given!')
        # Non-zero status lets calling scripts detect the usage error.
        raise SystemExit(1)

    newname = args[1] if len(args) > 1 else ''
    if not newname:
        folder, base = os.path.split(oldname)
        stem, ext = os.path.splitext(base)
        newname = os.path.join(folder, (stem + '_P' + ext).replace(' ', '_'))

    print("Processing '%s' to '%s' ..." % (oldname, newname))
    fixepub(oldname, newname)
# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
    main()
FWIW,我编写这个程序是为了处理供我那台简单的电子阅读器(e-reader)使用的文件:如果段落不以空白字符结尾,它会烦人地把段落连接在一起。
我找到的解决方案:
删除之前的mimetype文件
在创建新存档时在添加任何其他内容之前创建一个新的 mimetype 文件:zipFile.writestr("mimetype", "application/epub+zip")
为什么有效:所有 epub 的 mimetype 都相同:"application/epub+zip",无需使用原始文件。
我正在编写一个从 html 文件创建 epub 的脚本,但是当我检查生成的 epub 时出现以下错误:Mimetype entry missing or not the first in archive
存在 Mimetype,但它不是 epub 中的第一个文件。知道如何在任何情况下使用 Python 将它放在首位吗?
抱歉,我现在没有时间给出详细的解释,但这是我不久前写的一个(相对)简单的 epub 处理程序,它展示了如何做到这一点。
epubpad.py
#! /usr/bin/env python
''' Pad the ends of paragraph lines in an epub file with a single space char
Written by PM 2Ring 2013.05.12
'''
import sys, re, zipfile
def bold(s):
    """Return *s* wrapped in the ANSI escape codes that switch bold on and off."""
    template = "\x1b[1m%s\x1b[0m"
    return template % s
def report(attr, val):
    """Print a labelled value on one line.

    The label is *attr* followed by a colon, rendered through ``bold``;
    *val* is printed after it, wrapped in single quotes.
    """
    # A single pre-formatted argument produces the same output whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print("%s '%s'" % (bold(attr + ':'), val))
def fixepub(oldname, newname):
    """Copy the epub *oldname* to *newname*, padding paragraph lines.

    Every archive member whose name looks like an (x)html file, and does not
    contain 'wrap', has a single space appended to each non-empty line that
    does not already end in '>' or '}'.  All other members are copied
    unchanged.

    The 'mimetype' entry is always written first and uncompressed, because
    epub checkers reject archives where it is missing or not the first
    entry.  Its content is taken from the old archive when present,
    otherwise the standard 'application/epub+zip' is used.

    The content-file directory, creator and title are printed for
    information only; a missing creator or title is skipped.

    *newname* is opened for writing and truncated, so it must not name the
    same file as *oldname*.

    Raises KeyError if 'META-INF/container.xml' (or the content file it
    names) is not in the archive, and ValueError if the container has no
    full-path attribute.
    """
    def as_text(raw):
        # Archive data is bytes on Python 3 but already str on Python 2.
        return raw if isinstance(raw, str) else raw.decode('utf-8', 'replace')

    def first_group(pattern, data):
        # Group 1 of the first match of *pattern* in *data*, or None.
        found = re.search(pattern, data)
        return as_text(found.group(1)) if found else None

    container = 'META-INF/container.xml'
    # Names of all xhtml & html text files
    htmp = re.compile(r'\.[x]?htm[l]?')
    # Pattern for end of lines that don't need padding
    eolp = re.compile(br'[>}]$')

    # The context managers close both archives even if processing fails.
    with zipfile.ZipFile(oldname, 'r') as oldz:
        nlist = oldz.namelist()
        first = nlist[0] if nlist else ''
        if first != 'mimetype':
            print("%s First file is '%s', not 'mimetype'"
                  % (bold('Warning!!!'), first))

        # Get the name of the contents file from the container.
        contents = first_group(br'full-path="(.*?)"', oldz.read(container))
        if contents is None:
            raise ValueError("No full-path attribute found in '%s'" % container)

        # find() returns -1 when there is no separator, so the slice is then
        # empty; otherwise it keeps everything up to and including the '/'.
        dirname = contents[:contents.find('/') + 1]
        report("dirname", dirname)

        opf = oldz.read(contents)
        for label, pattern in (
            ("Creator", br'<dc:creator.*>(.*)</dc:creator>'),
            ("Title", br'<dc:title>(.*)</dc:title>'),
        ):
            value = first_group(pattern, opf)
            if value is not None:
                report(label, value)

        # A set keeps the per-member membership test cheap.
        htmnames = set(
            name for name in nlist
            if htmp.search(name) and 'wrap' not in name
        )

        if 'mimetype' in nlist:
            mimetype = oldz.read('mimetype')
        else:
            mimetype = b'application/epub+zip'

        with zipfile.ZipFile(newname, 'w', zipfile.ZIP_DEFLATED) as newz:
            # Written before anything else, straight from memory and
            # uncompressed; no temporary file is left on disk.
            newz.writestr('mimetype', mimetype, zipfile.ZIP_STORED)
            print('mimetype  * stored')

            for fname in nlist:
                if fname == 'mimetype':
                    continue  # already written as the first entry
                s = oldz.read(fname)
                status = fname
                if fname in htmnames:
                    status += '  * text'
                    # Pad lines that are (hopefully) inside paragraphs...
                    s = b'\n'.join(
                        line if not line or eolp.search(line) else line + b' '
                        for line in s.splitlines()
                    )
                print(status)
                newz.writestr(fname, s)
def main():
    """Command-line entry point: ``epubpad.py OLDNAME [NEWNAME]``.

    OLDNAME is the epub to read.  When NEWNAME is omitted, it is derived by
    inserting '_P' before the extension of OLDNAME's file name and replacing
    spaces in that file name with underscores; the directory part is kept
    as given.  The derived name therefore always differs from OLDNAME, so
    the input is never opened for writing.

    Prints a message and exits with status 1 when no file name is given.
    """
    import os  # local import: only the name derivation below needs it

    args = sys.argv[1:]
    oldname = args[0] if args else ''
    if not oldname:
        print('No filename given!')
        # Non-zero status lets calling scripts detect the usage error.
        raise SystemExit(1)

    newname = args[1] if len(args) > 1 else ''
    if not newname:
        folder, base = os.path.split(oldname)
        stem, ext = os.path.splitext(base)
        newname = os.path.join(folder, (stem + '_P' + ext).replace(' ', '_'))

    print("Processing '%s' to '%s' ..." % (oldname, newname))
    fixepub(oldname, newname)
# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
    main()
FWIW,我编写这个程序是为了处理供我那台简单的电子阅读器(e-reader)使用的文件:如果段落不以空白字符结尾,它会烦人地把段落连接在一起。
我找到的解决方案:
删除之前的mimetype文件
在创建新存档时在添加任何其他内容之前创建一个新的 mimetype 文件:
zipFile.writestr("mimetype", "application/epub+zip")
为什么有效:所有 epub 的 mimetype 都相同:"application/epub+zip",无需使用原始文件。