如何在 Python 中合并多个字幕文件?
how to merge multiple subtitle files in python?
我有一批 srt 文件需要合并:
sub1.srt
1
00:00:21,601 --> 00:00:24,130
- What happened? - It's a mess, I heard.
2
00:00:24,131 --> 00:00:25,900
- What's that? - Dead bodies?
3
00:00:25,901 --> 00:00:28,839
- What's going on? - I wish I knew.
sub2.srt
1
00:00:28,840 --> 00:00:31,310
No one knows. They won't say.
2
00:00:31,311 --> 00:00:35,276
- My gosh. - How can so many die?
3
00:00:45,191 --> 00:00:46,556
When you starve,
合并后
1
00:00:21,601 --> 00:00:24,130
- What happened? - It's a mess, I heard.
2
00:00:24,131 --> 00:00:25,900
- What's that? - Dead bodies?
3
00:00:25,901 --> 00:00:28,839
- What's going on? - I wish I knew.
4
00:00:28,840 --> 00:00:31,310
No one knows. They won't say.
5
00:00:31,311 --> 00:00:35,276
- My gosh. - How can so many die?
6
00:00:45,191 --> 00:00:46,556
When you starve,
我发现下面这个脚本可以运行,但问题出在编号上:合并后字幕的序号不对。
# Files to concatenate, in playback order.
filenames = ['sub1.srt', 'sub2.srt']

# Copy every input file verbatim into the output file. Nothing is rewritten,
# so the subtitle indices of each input are written out unchanged.
with open('output_file.srt', 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            outfile.writelines(infile)
显示无序
1
2
3
1
2
3
如何修复?
您可以尝试以下方法:
import re
# Matches a subtitle index line: digits alone on a line that is directly
# followed by a timing line ("HH:MM:SS,mmm -->").
# [ \t] is used instead of \s because \s also matches newlines, which would let
# a match swallow the blank separator line in front of an index. The lookahead
# keeps subtitle text that consists only of digits (e.g. "1984") untouched.
re_sub_no = re.compile(
    r"^[ \t]*\d+[ \t]*(?=\r?\n[ \t]*\d+:\d+:\d+[,.]\d+[ \t]*-->)",
    re.MULTILINE,
)


def repl(match):
    """Return the next running subtitle number as replacement text.

    Intended as the callback for ``re_sub_no.sub``. The matched text itself is
    ignored; each call advances the module-level counter ``sub_no`` by one and
    returns its new value as a string.
    """
    global sub_no
    sub_no += 1
    return str(sub_no)


# Running subtitle number shared by all files; the first number handed out is 1.
sub_no = 0
# Subtitle files to merge, in playback order.
filenames = ["sub1.srt", "sub2.srt"]

with open("sub_merged.srt", "w") as fout:
    for name in filenames:
        # Read the whole file and renumber its indices with the shared counter.
        with open(name, "r") as fin:
            renumbered = re_sub_no.sub(repl, fin.read())
        # Two newlines are appended after each file's text.
        fout.write(renumbered + "\n\n")
正则表达式模式 re_sub_no 用于查找字幕编号,再通过 re.sub 配合 repl 函数保证编号连续。sub_no 变量被声明为 global,是因为函数本身无法保存编号的当前状态。模式中匹配数字前后空白的部分(例如 \s*)只是为了防止数字前后出现空白而加的预防措施,也许并不需要(我没有试过)。
上面的代码没有处理文件的编码问题。如果对您链接的文件运行以下代码:
import chardet
# Print chardet's detection result for each subtitle file.
for name in ["sub1.srt", "sub2.srt"]:
    # Binary mode: chardet.detect is given the raw bytes of the file.
    with open(name, "br") as file:
        raw = file.read()
    print(chardet.detect(raw))
结果很可能是
{'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}
{'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}
因此,以下修改应该会奏效:
...
# Same merge loop, with every file opened using the "utf-8-sig" codec.
with open("sub_merged.srt", "w", encoding="utf-8-sig") as fout:
    for name in filenames:
        with open(name, "r", encoding="utf-8-sig") as fin:
            renumbered = re_sub_no.sub(repl, fin.read())
        fout.write(renumbered + "\n\n")
为了将来使用,您可以将其组合成:
import chardet, re
# Subtitle files to merge, in playback order.
filenames = ["sub1.srt", "sub2.srt"]

# encodings[i] is the "encoding" entry of chardet's result for filenames[i].
encodings = []
for name in filenames:
    with open(name, "br") as file:
        guess = chardet.detect(file.read())
    encodings.append(guess["encoding"])
# Matches a subtitle index line: digits alone on a line that is directly
# followed by a timing line ("HH:MM:SS,mmm -->").
# [ \t] is used instead of \s because \s also matches newlines, which would let
# a match swallow a neighbouring line break. The lookahead keeps subtitle text
# that consists only of digits (e.g. "1984") untouched.
re_sub_no = re.compile(
    r"^[ \t]*\d+[ \t]*(?=\r?\n[ \t]*\d+:\d+:\d+[,.]\d+[ \t]*-->)",
    re.MULTILINE,
)


def repl(match):
    """Return the next running subtitle number as replacement text.

    Intended as the callback for ``re_sub_no.sub``. The matched text itself is
    ignored; each call advances the module-level counter ``sub_no`` by one and
    returns its new value as a string.
    """
    global sub_no
    sub_no += 1
    return str(sub_no)


# Running subtitle number shared by all files; the first number handed out is 1.
sub_no = 0
# The merged file is written with the encoding detected for the first input;
# each input is read with its own detected encoding.
with open("sub_merged.srt", "w", encoding=encodings[0]) as fout:
    for name, encoding in zip(filenames, encodings):
        with open(name, "r", encoding=encoding) as fin:
            renumbered = re_sub_no.sub(repl, fin.read())
        fout.write(renumbered + "\n\n")
此脚本会把第二个 SRT 文件中的每条字幕追加到第一个 SRT 文件中开始时间最接近的已有字幕上。
需要 srt 模块,可通过 pip3 install srt 安装。
用法
# by default output to subs1_MERGED.srt
python3 srt_merge.py subs1.srt subs2.srt
# output to custom file name final.srt
python3 srt_merge.py subs1.srt subs2.srt -o final.srt
#!/usr/bin/env python3
import argparse
import sys
from datetime import timedelta
from pathlib import Path
# REQUIRED MODULE: pip3 install srt
import srt
def nearest(items, pivot):
    """Return the element of *items* whose ``start`` lies closest to *pivot*.

    Closeness is ``abs(item.start - pivot)``. When several elements are
    equally close, the first one encountered is returned (the behaviour of
    ``min``). An empty *items* makes ``min`` raise ``ValueError``.
    """
    def distance(item):
        return abs(item.start - pivot)

    return min(items, key=distance)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='merge SRT subtitles',
usage="""
Merge SRT subtitles:
\t{0} first.srt second.srt -o merged.srt
""".format(Path(sys.argv[0]).name))
parser.add_argument('srt1',
metavar='srt1',
help='SRT-file-1')
parser.add_argument('srt2',
metavar='srt2',
help='SRT-file-2')
parser.add_argument('--output-file', '-o',
default=None,
help='Output filename')
parser.add_argument('--encoding', '-e',
default=None,
help='Input file encoding')
args = parser.parse_args(sys.argv[1:])
srt1_path = Path(args.srt1)
srt2_path = Path(args.srt2)
with srt1_path.open(encoding=args.encoding or 'utf-8') as fi1:
subs1 = {s.index: s for s in srt.parse(fi1)}
with srt2_path.open(encoding=args.encoding or 'utf-8') as fi2:
subs2 = {s.index: s for s in srt.parse(fi2)}
# iterate all subs in srt2 and find the closest EXISTING slot in srt1
sub: srt.Subtitle
start: int
for idx, sub in subs2.items():
start: timedelta = sub.start
sub_nearest_slot: srt.Subtitle = nearest(subs1.values(), start)
sub_nearest_slot.content = f'{sub_nearest_slot.content}<br>{sub.content}'
subs1[sub_nearest_slot.index] = sub_nearest_slot
if not args.output_file:
generated_srt = srt1_path.parent / (f'{srt1_path.stem}_MERGED_{srt1_path.suffix}')
else:
generated_srt = Path(args.output_file)
with generated_srt.open(mode='w', encoding='utf-8') as fout:
fout.write(srt.compose(list(subs1.values())))
我有一批 srt 文件需要合并:
sub1.srt
1
00:00:21,601 --> 00:00:24,130
- What happened? - It's a mess, I heard.
2
00:00:24,131 --> 00:00:25,900
- What's that? - Dead bodies?
3
00:00:25,901 --> 00:00:28,839
- What's going on? - I wish I knew.
sub2.srt
1
00:00:28,840 --> 00:00:31,310
No one knows. They won't say.
2
00:00:31,311 --> 00:00:35,276
- My gosh. - How can so many die?
3
00:00:45,191 --> 00:00:46,556
When you starve,
合并后
1
00:00:21,601 --> 00:00:24,130
- What happened? - It's a mess, I heard.
2
00:00:24,131 --> 00:00:25,900
- What's that? - Dead bodies?
3
00:00:25,901 --> 00:00:28,839
- What's going on? - I wish I knew.
4
00:00:28,840 --> 00:00:31,310
No one knows. They won't say.
5
00:00:31,311 --> 00:00:35,276
- My gosh. - How can so many die?
6
00:00:45,191 --> 00:00:46,556
When you starve,
我发现下面这个脚本可以运行,但问题出在编号上:合并后字幕的序号不对。
# Files to concatenate, in playback order.
filenames = ['sub1.srt', 'sub2.srt']

# Copy every input file verbatim into the output file. Nothing is rewritten,
# so the subtitle indices of each input are written out unchanged.
with open('output_file.srt', 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            outfile.writelines(infile)
显示无序
1
2
3
1
2
3
如何修复?
您可以尝试以下方法:
import re
# Matches a subtitle index line: digits alone on a line that is directly
# followed by a timing line ("HH:MM:SS,mmm -->").
# [ \t] is used instead of \s because \s also matches newlines, which would let
# a match swallow the blank separator line in front of an index. The lookahead
# keeps subtitle text that consists only of digits (e.g. "1984") untouched.
re_sub_no = re.compile(
    r"^[ \t]*\d+[ \t]*(?=\r?\n[ \t]*\d+:\d+:\d+[,.]\d+[ \t]*-->)",
    re.MULTILINE,
)


def repl(match):
    """Return the next running subtitle number as replacement text.

    Intended as the callback for ``re_sub_no.sub``. The matched text itself is
    ignored; each call advances the module-level counter ``sub_no`` by one and
    returns its new value as a string.
    """
    global sub_no
    sub_no += 1
    return str(sub_no)


# Running subtitle number shared by all files; the first number handed out is 1.
sub_no = 0
# Subtitle files to merge, in playback order.
filenames = ["sub1.srt", "sub2.srt"]

with open("sub_merged.srt", "w") as fout:
    for name in filenames:
        # Read the whole file and renumber its indices with the shared counter.
        with open(name, "r") as fin:
            renumbered = re_sub_no.sub(repl, fin.read())
        # Two newlines are appended after each file's text.
        fout.write(renumbered + "\n\n")
正则表达式模式 re_sub_no 用于查找字幕编号,再通过 re.sub 配合 repl 函数保证编号连续。sub_no 变量被声明为 global,是因为函数本身无法保存编号的当前状态。模式中匹配数字前后空白的部分(例如 \s*)只是为了防止数字前后出现空白而加的预防措施,也许并不需要(我没有试过)。
上面的代码没有处理文件的编码问题。如果对您链接的文件运行以下代码:
import chardet
# Print chardet's detection result for each subtitle file.
for name in ["sub1.srt", "sub2.srt"]:
    # Binary mode: chardet.detect is given the raw bytes of the file.
    with open(name, "br") as file:
        raw = file.read()
    print(chardet.detect(raw))
结果很可能是
{'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}
{'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}
因此,以下修改应该会奏效:
...
# Same merge loop, with every file opened using the "utf-8-sig" codec.
with open("sub_merged.srt", "w", encoding="utf-8-sig") as fout:
    for name in filenames:
        with open(name, "r", encoding="utf-8-sig") as fin:
            renumbered = re_sub_no.sub(repl, fin.read())
        fout.write(renumbered + "\n\n")
为了将来使用,您可以将其组合成:
import chardet, re
# Subtitle files to merge, in playback order.
filenames = ["sub1.srt", "sub2.srt"]

# encodings[i] is the "encoding" entry of chardet's result for filenames[i].
encodings = []
for name in filenames:
    with open(name, "br") as file:
        guess = chardet.detect(file.read())
    encodings.append(guess["encoding"])
# Matches a subtitle index line: digits alone on a line that is directly
# followed by a timing line ("HH:MM:SS,mmm -->").
# [ \t] is used instead of \s because \s also matches newlines, which would let
# a match swallow a neighbouring line break. The lookahead keeps subtitle text
# that consists only of digits (e.g. "1984") untouched.
re_sub_no = re.compile(
    r"^[ \t]*\d+[ \t]*(?=\r?\n[ \t]*\d+:\d+:\d+[,.]\d+[ \t]*-->)",
    re.MULTILINE,
)


def repl(match):
    """Return the next running subtitle number as replacement text.

    Intended as the callback for ``re_sub_no.sub``. The matched text itself is
    ignored; each call advances the module-level counter ``sub_no`` by one and
    returns its new value as a string.
    """
    global sub_no
    sub_no += 1
    return str(sub_no)


# Running subtitle number shared by all files; the first number handed out is 1.
sub_no = 0
# The merged file is written with the encoding detected for the first input;
# each input is read with its own detected encoding.
with open("sub_merged.srt", "w", encoding=encodings[0]) as fout:
    for name, encoding in zip(filenames, encodings):
        with open(name, "r", encoding=encoding) as fin:
            renumbered = re_sub_no.sub(repl, fin.read())
        fout.write(renumbered + "\n\n")
此脚本会把第二个 SRT 文件中的每条字幕追加到第一个 SRT 文件中开始时间最接近的已有字幕上。
需要 srt 模块,可通过 pip3 install srt 安装。
用法
# by default output to subs1_MERGED.srt
python3 srt_merge.py subs1.srt subs2.srt
# output to custom file name final.srt
python3 srt_merge.py subs1.srt subs2.srt -o final.srt
#!/usr/bin/env python3
import argparse
import sys
from datetime import timedelta
from pathlib import Path
# REQUIRED MODULE: pip3 install srt
import srt
def nearest(items, pivot):
    """Return the element of *items* whose ``start`` lies closest to *pivot*.

    Closeness is ``abs(item.start - pivot)``. When several elements are
    equally close, the first one encountered is returned (the behaviour of
    ``min``). An empty *items* makes ``min`` raise ``ValueError``.
    """
    def distance(item):
        return abs(item.start - pivot)

    return min(items, key=distance)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='merge SRT subtitles',
usage="""
Merge SRT subtitles:
\t{0} first.srt second.srt -o merged.srt
""".format(Path(sys.argv[0]).name))
parser.add_argument('srt1',
metavar='srt1',
help='SRT-file-1')
parser.add_argument('srt2',
metavar='srt2',
help='SRT-file-2')
parser.add_argument('--output-file', '-o',
default=None,
help='Output filename')
parser.add_argument('--encoding', '-e',
default=None,
help='Input file encoding')
args = parser.parse_args(sys.argv[1:])
srt1_path = Path(args.srt1)
srt2_path = Path(args.srt2)
with srt1_path.open(encoding=args.encoding or 'utf-8') as fi1:
subs1 = {s.index: s for s in srt.parse(fi1)}
with srt2_path.open(encoding=args.encoding or 'utf-8') as fi2:
subs2 = {s.index: s for s in srt.parse(fi2)}
# iterate all subs in srt2 and find the closest EXISTING slot in srt1
sub: srt.Subtitle
start: int
for idx, sub in subs2.items():
start: timedelta = sub.start
sub_nearest_slot: srt.Subtitle = nearest(subs1.values(), start)
sub_nearest_slot.content = f'{sub_nearest_slot.content}<br>{sub.content}'
subs1[sub_nearest_slot.index] = sub_nearest_slot
if not args.output_file:
generated_srt = srt1_path.parent / (f'{srt1_path.stem}_MERGED_{srt1_path.suffix}')
else:
generated_srt = Path(args.output_file)
with generated_srt.open(mode='w', encoding='utf-8') as fout:
fout.write(srt.compose(list(subs1.values())))