Python: 如何按行拆分文本并合并其中的某些行
Python: How to split lines merging some of the lines
我想逐行处理一个字符串,但我想启用多行支持。这是示例文本:
First line
Second line
{{{
these three lines
I want to process
together
}}}
Last Line
我希望多行从 {{{
开始并在 }}}
结束
我以前是这样逐行处理的:
# Plain per-line processing: every physical line becomes one list entry.
lines = text.splitlines()
print(lines)
现在这段代码输出:
['First line', 'Second line', '{{{', 'these three lines', 'I want to process', 'together', '}}}', 'Last Line']
我想以某种方式使 lines
包含以下内容:
['First line', 'Second line', 'these three lines I want to process together', 'Last Line']
或者,更高级的示例
First Line
Second line
Third{{{line
fourth line
fifth}}}line
sixth line
在这种情况下,我希望行包含
['First Line', 'Second line', 'Third', 'line fourth line fifth', 'line', 'sixth line']
def split(text):
    """Split *text* into lines, merging each ``{{{ ... }}}`` group into one.

    Text outside a group is split with ``str.splitlines``. The lines inside
    a group are joined with single spaces and stripped of surrounding
    whitespace. An opening ``{{{`` without a matching ``}}}`` treats the
    rest of the text as the group.

    Args:
        text: The string to split.

    Returns:
        A list of strings: ordinary lines and merged groups, in order.
    """
    lines = []
    while '{{{' in text:
        before, _, rest = text.partition('{{{')
        lines.extend(before.splitlines())
        group, _, text = rest.partition('}}}')
        lines.append(' '.join(group.splitlines()).strip())
        # The line break that ends the ``}}}`` line belongs to the group;
        # leaving it in place would produce a spurious empty entry.
        for eol in ('\r\n', '\n', '\r'):
            if text.startswith(eol):
                text = text[len(eol):]
                break
    lines.extend(text.splitlines())
    return lines
这是我的解决方案。它又长又简单。我希望也许有一种方法可以在几行内完成但它不会处理 }}}
和 {{{
在同一行的情况
def _split_with_merging(text):
lines = [l for l in text.splitlines() if l != ""]
nlines = []
multiline = False
for l in lines:
if multiline:
if "}}}" in l:
lparts = l.split("}}}")
nlines[len(nlines) - 1] += lparts[0]
if lparts[1] != "":
nlines.append(lparts[1])
multiline = False
else:
nlines[len(nlines) - 1] += l
else:
if "{{{" in l:
lparts = l.split("{{{")
nlines.append(lparts[0])
if lparts[1] != "":
nlines.append(lparts[1])
multiline = True
else:
nlines.append(l)
return nlines
您可以使用正则表达式,假设您只对 {{{ }}}
之间的行感兴趣
import re

text = """First line
Second line
THIS{{{
these three lines
I want to process
together
}}}
Last Line"""

# re.DOTALL lets "." match newlines, so the captured group can span lines.
match_obj = re.search('{{{(.*)}}}', text, re.DOTALL)
print(match_obj.group(1))
或
# Splitting on a pattern with a capturing group keeps the captured text
# in the resulting list.
r = re.compile('{{{(.*)}}}', flags=re.DOTALL)
print(r.split(text))
# replace \n
split_list = [piece.replace('\n', '') for piece in r.split(text)]
print(split_list)
或
# Collect the text of every bracketed group, then drop its line breaks.
match_list = re.findall('{{{(.*)}}}', text, re.DOTALL)
match_list = [found.replace('\n', '') for found in match_list]
print(match_list)
如果给定文本中多次出现 {{{ }}}
,请添加“?”以使用非贪婪(non-greedy)匹配,例如 {{{(.*?)}}}
我认为这是一个快速简单的解决方案,可以满足您要完成的任务:
text = """First line
Second line
{{{
these three lines
I want to process
together
}}}
Last Line"""

all_lines = text.splitlines()
final_list = []
nested = False
multiline = ""  # defined up front so a stray "}}}" cannot hit an unbound name
for line in all_lines:
    if line == "{{{":
        # A line holding only the opening marker starts a new merged entry.
        nested = True
        multiline = ""
        continue
    elif line == "}}}":
        # A line holding only the closing marker emits the merged entry.
        nested = False
        final_list.append(multiline)
        continue
    if nested:
        # Separate pieces with one space, but never start with one.
        multiline = multiline + " " + line if multiline else line
    else:
        final_list.append(line)
print(final_list)
可能不是有史以来最干净的代码,我认为我们应该用 .format()
替换 multiline = multiline + " " + line
,但我希望你明白了。
在带有 in_multi
标志的循环中跟踪开始 {{{
和结束 }}}
直截了当:
def split_multi(s):
    """Split *s* into lines, merging each ``{{{ ... }}}`` group into one.

    Lines outside a group are returned unchanged (blank lines included).
    Fragments inside a group are joined with single spaces. Text before
    ``{{{`` or after ``}}}`` on a marker line becomes its own entry when it
    is non-empty. Any number of markers may share a line, and an
    unterminated group is emitted at the end rather than dropped.

    Args:
        s: The string to split.

    Returns:
        A list of strings: ordinary lines and merged groups, in order.
    """
    lines = []
    group = None  # fragments of the open group, or None when outside one
    for line in s.splitlines():
        # Fast paths: lines without a relevant marker are kept verbatim.
        if group is None and '{{{' not in line:
            lines.append(line)
            continue
        if group is not None and '}}}' not in line:
            group.append(line)
            continue
        # Marker line: consume it piece by piece so that several markers on
        # the same line are all honoured.
        rest = line
        while rest:
            if group is None:
                head, marker, rest = rest.partition('{{{')
                if head:
                    lines.append(head)
                if marker:
                    group = []
            else:
                head, marker, rest = rest.partition('}}}')
                if head:
                    group.append(head)
                if marker:
                    lines.append(' '.join(group))
                    group = None
    if group:
        lines.append(' '.join(group))
    return lines
s1 = """First line
Second line
{{{
these three lines
I want to process
together
}}}
Last Line"""
s2 = """First Line
Second line
Third{{{line
fourth line
fifth}}}line
sixth line"""
# Run both samples through the splitter and show each result.
for sample in (s1, s2):
    print(split_multi(sample))
#['First Line', 'Second line', 'Third', 'line fourth line fifth', 'line', 'sixth line']
输出:
['First line', 'Second line', 'these three lines I want to process together', 'Last Line']
['First Line', 'Second line', 'Third', 'line fourth line fifth', 'line', 'sixth line']
使用正则表达式似乎是一个明智的解决方案 - 它使您可以灵活地选择两个输入选项
import re
only_line = '''First line
Second line
{{{
these three lines
I want to process
together
}}}
Last Line'''
mixed_line = '''First Line
Second line
Third{{{line
fourth line
fifth}}}line
sixth line'''
def curly_brackets(input_string):
    """Split *input_string* into lines, merging ``{{{ ... }}}`` groups.

    Each bracketed group is matched non-greedily, so any number of groups is
    supported. Line breaks inside a group are replaced by spaces and the
    result is stripped. Text between groups is stripped as a whole and then
    split on line breaks; empty stretches contribute nothing. Input without
    any group is simply split into lines.

    Args:
        input_string: The text to split.

    Returns:
        A flat list of strings in document order.
    """
    # Raw string: "\{" in a normal literal is an invalid escape sequence.
    pattern = re.compile(r'\{{3}(.*?)\}{3}', re.DOTALL)
    result = []
    position = 0
    for match in pattern.finditer(input_string):
        outside = input_string[position:match.start()].strip()
        if outside:
            result.extend(outside.split('\n'))
        # Index 1 is the text between the brackets.
        result.append(match.group(1).replace('\n', ' ').strip())
        position = match.end()
    trailing = input_string[position:].strip()
    if trailing:
        result.extend(trailing.split('\n'))
    return result
# Show the result for both sample inputs.
print(curly_brackets(only_line))
print(curly_brackets(mixed_line))
返回结果:
['First line', 'Second line', 'these three lines I want to process together', 'Last Line']
['First Line', 'Second line', 'Third', 'line fourth line fifth', 'line', 'sixth line']
如果您有多组花括号,这将不起作用,但可以调整为以迭代方式应用。
这是一个将输入文件对象作为参数的生成器,一次生成一行。它应该在同一行接受尽可能多的 {{{
和 }}}
但不测试不平衡的结构:
def merge_lines(fd):
    """Yield the lines of *fd*, merging each ``{{{ ... }}}`` group into one.

    *fd* is any iterable of lines, such as an open text file. Whitespace-only
    lines and fragments are skipped. Fragments inside a group are joined
    with single spaces. Any number of markers may appear on one line.
    Unbalanced markers are not reported, but an unterminated group is still
    yielded at the end rather than dropped.

    Args:
        fd: An iterable yielding lines, with or without line endings.

    Yields:
        One string per ordinary line or merged group, in order.
    """
    group = None  # fragments of the open group, or None when outside one
    for line in fd:
        rest = line.rstrip('\r\n')
        # Consume the line marker by marker; each pass either shortens
        # ``rest`` or empties it, so the loop always terminates.
        while rest.strip():
            if group is None:
                head, marker, rest = rest.partition('{{{')
                if head.strip():
                    yield head
                if marker:
                    group = []
            else:
                head, marker, rest = rest.partition('}}}')
                if head.strip():
                    group.append(head)
                if marker:
                    if group:
                        yield ' '.join(group)
                    group = None
    if group:
        yield ' '.join(group)
Python3 中的示例:
>>> t = """First line
a{{{b}}}c{{{d
e
f}}}g{{{h
i}}}
j
k
"""
>>> for line in merge_lines(io.StringIO(t)): print(line)
First line
a
b
c
d e f
g
h i
j
k
我的一点浅见(使用 join
):
ex1 = """First line
Second line
{{{
these three lines
I want to process
together
}}}
Last Line"""
ex2 = """First Line
Second line
Third{{{line
fourth line
fifth}}}line
sixth line"""
def parse_lines(txt, start_sep='{{{', end_sep='}}}'):
    """Split *txt* into lines, merging text between the two separators.

    The text is scanned once. Outside a group a line break ends the current
    entry. Inside a group a line break becomes a single space, or is dropped
    when the entry is still empty. Both separators also end the current
    entry. Empty entries are never emitted.

    A line break directly before *end_sep* therefore leaves one trailing
    space on the merged entry. A stray *end_sep* outside any group is
    ignored for nesting purposes, so it cannot cause later lines to merge.

    Args:
        txt: The text to split.
        start_sep: Non-empty marker that opens a group.
        end_sep: Non-empty marker that closes a group.

    Returns:
        A list of non-empty strings in document order.

    Raises:
        ValueError: If either separator is empty.
    """
    if not start_sep or not end_sep:
        raise ValueError("separators must be non-empty")

    depth = 0  # 1+ while inside a group
    lines = []
    buffer = []  # characters of the entry being built

    def flush():
        # Emit the pending entry, if any, and start a new one.
        if buffer:
            lines.append(''.join(buffer))
            del buffer[:]

    i = 0
    n = len(txt)
    while i < n:
        if txt[i] == '\n':
            if depth == 0:
                flush()
            elif buffer:
                # Inside a group a line break reads as a space.
                buffer.append(' ')
            i += 1
        elif txt.startswith(start_sep, i):
            depth += 1
            flush()
            i += len(start_sep)
        elif txt.startswith(end_sep, i):
            # Never go below zero: a negative depth would make every later
            # line break look like it is inside a group.
            if depth > 0:
                depth -= 1
            flush()
            i += len(end_sep)
        else:
            buffer.append(txt[i])
            i += 1
    flush()
    return lines
其返回结果为:
>>> parse_lines(ex1)
['First line', 'Second line', 'these three lines I want to process together ', 'Last Line']
>>> parse_lines(ex2)
['First Line', 'Second line', 'Third', 'line fourth line fifth', 'line', 'sixth line']
请注意第一个示例中以 '\n}}}'
结尾的多行中的额外 ' '
。
我想逐行处理一个字符串,但我想启用多行支持。这是示例文本:
First line
Second line
{{{
these three lines
I want to process
together
}}}
Last Line
我希望多行从 {{{
开始并在 }}}
结束
我以前是这样逐行处理的:
# Plain per-line processing: every physical line becomes one list entry.
lines = text.splitlines()
print(lines)
现在这段代码输出:
['First line', 'Second line', '{{{', 'these three lines', 'I want to process', 'together', '}}}', 'Last Line']
我想以某种方式使 lines
包含以下内容:
['First line', 'Second line', 'these three lines I want to process together', 'Last Line']
或者,更高级的示例
First Line
Second line
Third{{{line
fourth line
fifth}}}line
sixth line
在这种情况下,我希望行包含
['First Line', 'Second line', 'Third', 'line fourth line fifth', 'line', 'sixth line']
def split(text):
    """Split *text* into lines, merging each ``{{{ ... }}}`` group into one.

    Text outside a group is split with ``str.splitlines``. The lines inside
    a group are joined with single spaces and stripped of surrounding
    whitespace. An opening ``{{{`` without a matching ``}}}`` treats the
    rest of the text as the group.

    Args:
        text: The string to split.

    Returns:
        A list of strings: ordinary lines and merged groups, in order.
    """
    lines = []
    while '{{{' in text:
        before, _, rest = text.partition('{{{')
        lines.extend(before.splitlines())
        group, _, text = rest.partition('}}}')
        lines.append(' '.join(group.splitlines()).strip())
        # The line break that ends the ``}}}`` line belongs to the group;
        # leaving it in place would produce a spurious empty entry.
        for eol in ('\r\n', '\n', '\r'):
            if text.startswith(eol):
                text = text[len(eol):]
                break
    lines.extend(text.splitlines())
    return lines
这是我的解决方案。它又长又简单。我希望也许有一种方法可以在几行内完成但它不会处理 }}}
和 {{{
在同一行的情况
def _split_with_merging(text):
lines = [l for l in text.splitlines() if l != ""]
nlines = []
multiline = False
for l in lines:
if multiline:
if "}}}" in l:
lparts = l.split("}}}")
nlines[len(nlines) - 1] += lparts[0]
if lparts[1] != "":
nlines.append(lparts[1])
multiline = False
else:
nlines[len(nlines) - 1] += l
else:
if "{{{" in l:
lparts = l.split("{{{")
nlines.append(lparts[0])
if lparts[1] != "":
nlines.append(lparts[1])
multiline = True
else:
nlines.append(l)
return nlines
您可以使用正则表达式,假设您只对 {{{ }}} 之间的行感兴趣
import re

text = """First line
Second line
THIS{{{
these three lines
I want to process
together
}}}
Last Line"""

# re.DOTALL lets "." match newlines, so the captured group can span lines.
match_obj = re.search('{{{(.*)}}}', text, re.DOTALL)
print(match_obj.group(1))
或
# Splitting on a pattern with a capturing group keeps the captured text
# in the resulting list.
r = re.compile('{{{(.*)}}}', flags=re.DOTALL)
print(r.split(text))
# replace \n
split_list = [piece.replace('\n', '') for piece in r.split(text)]
print(split_list)
或
# Collect the text of every bracketed group, then drop its line breaks.
match_list = re.findall('{{{(.*)}}}', text, re.DOTALL)
match_list = [found.replace('\n', '') for found in match_list]
print(match_list)
如果给定文本中多次出现 {{{ }}}
,请添加“?”以使用非贪婪(non-greedy)匹配,例如 {{{(.*?)}}}
我认为这是一个快速简单的解决方案,可以满足您要完成的任务:
text = """First line
Second line
{{{
these three lines
I want to process
together
}}}
Last Line"""

all_lines = text.splitlines()
final_list = []
nested = False
multiline = ""  # defined up front so a stray "}}}" cannot hit an unbound name
for line in all_lines:
    if line == "{{{":
        # A line holding only the opening marker starts a new merged entry.
        nested = True
        multiline = ""
        continue
    elif line == "}}}":
        # A line holding only the closing marker emits the merged entry.
        nested = False
        final_list.append(multiline)
        continue
    if nested:
        # Separate pieces with one space, but never start with one.
        multiline = multiline + " " + line if multiline else line
    else:
        final_list.append(line)
print(final_list)
可能不是有史以来最干净的代码,我认为我们应该用 .format()
替换 multiline = multiline + " " + line
,但我希望你明白了。
在带有 in_multi
标志的循环中跟踪开始 {{{
和结束 }}}
直截了当:
def split_multi(s):
    """Split *s* into lines, merging each ``{{{ ... }}}`` group into one.

    Lines outside a group are returned unchanged (blank lines included).
    Fragments inside a group are joined with single spaces. Text before
    ``{{{`` or after ``}}}`` on a marker line becomes its own entry when it
    is non-empty. Any number of markers may share a line, and an
    unterminated group is emitted at the end rather than dropped.

    Args:
        s: The string to split.

    Returns:
        A list of strings: ordinary lines and merged groups, in order.
    """
    lines = []
    group = None  # fragments of the open group, or None when outside one
    for line in s.splitlines():
        # Fast paths: lines without a relevant marker are kept verbatim.
        if group is None and '{{{' not in line:
            lines.append(line)
            continue
        if group is not None and '}}}' not in line:
            group.append(line)
            continue
        # Marker line: consume it piece by piece so that several markers on
        # the same line are all honoured.
        rest = line
        while rest:
            if group is None:
                head, marker, rest = rest.partition('{{{')
                if head:
                    lines.append(head)
                if marker:
                    group = []
            else:
                head, marker, rest = rest.partition('}}}')
                if head:
                    group.append(head)
                if marker:
                    lines.append(' '.join(group))
                    group = None
    if group:
        lines.append(' '.join(group))
    return lines
s1 = """First line
Second line
{{{
these three lines
I want to process
together
}}}
Last Line"""
s2 = """First Line
Second line
Third{{{line
fourth line
fifth}}}line
sixth line"""
# Run both samples through the splitter and show each result.
for sample in (s1, s2):
    print(split_multi(sample))
#['First Line', 'Second line', 'Third', 'line fourth line fifth', 'line', 'sixth line']
输出:
['First line', 'Second line', 'these three lines I want to process together', 'Last Line']
['First Line', 'Second line', 'Third', 'line fourth line fifth', 'line', 'sixth line']
使用正则表达式似乎是一个明智的解决方案 - 它使您可以灵活地选择两个输入选项
import re
only_line = '''First line
Second line
{{{
these three lines
I want to process
together
}}}
Last Line'''
mixed_line = '''First Line
Second line
Third{{{line
fourth line
fifth}}}line
sixth line'''
def curly_brackets(input_string):
    """Split *input_string* into lines, merging ``{{{ ... }}}`` groups.

    Each bracketed group is matched non-greedily, so any number of groups is
    supported. Line breaks inside a group are replaced by spaces and the
    result is stripped. Text between groups is stripped as a whole and then
    split on line breaks; empty stretches contribute nothing. Input without
    any group is simply split into lines.

    Args:
        input_string: The text to split.

    Returns:
        A flat list of strings in document order.
    """
    # Raw string: "\{" in a normal literal is an invalid escape sequence.
    pattern = re.compile(r'\{{3}(.*?)\}{3}', re.DOTALL)
    result = []
    position = 0
    for match in pattern.finditer(input_string):
        outside = input_string[position:match.start()].strip()
        if outside:
            result.extend(outside.split('\n'))
        # Index 1 is the text between the brackets.
        result.append(match.group(1).replace('\n', ' ').strip())
        position = match.end()
    trailing = input_string[position:].strip()
    if trailing:
        result.extend(trailing.split('\n'))
    return result
# Show the result for both sample inputs.
print(curly_brackets(only_line))
print(curly_brackets(mixed_line))
返回结果:
['First line', 'Second line', 'these three lines I want to process together', 'Last Line']
['First Line', 'Second line', 'Third', 'line fourth line fifth', 'line', 'sixth line']
如果您有多组花括号,这将不起作用,但可以调整为以迭代方式应用。
这是一个将输入文件对象作为参数的生成器,一次生成一行。它应该在同一行接受尽可能多的 {{{
和 }}}
但不测试不平衡的结构:
def merge_lines(fd):
    """Yield the lines of *fd*, merging each ``{{{ ... }}}`` group into one.

    *fd* is any iterable of lines, such as an open text file. Whitespace-only
    lines and fragments are skipped. Fragments inside a group are joined
    with single spaces. Any number of markers may appear on one line.
    Unbalanced markers are not reported, but an unterminated group is still
    yielded at the end rather than dropped.

    Args:
        fd: An iterable yielding lines, with or without line endings.

    Yields:
        One string per ordinary line or merged group, in order.
    """
    group = None  # fragments of the open group, or None when outside one
    for line in fd:
        rest = line.rstrip('\r\n')
        # Consume the line marker by marker; each pass either shortens
        # ``rest`` or empties it, so the loop always terminates.
        while rest.strip():
            if group is None:
                head, marker, rest = rest.partition('{{{')
                if head.strip():
                    yield head
                if marker:
                    group = []
            else:
                head, marker, rest = rest.partition('}}}')
                if head.strip():
                    group.append(head)
                if marker:
                    if group:
                        yield ' '.join(group)
                    group = None
    if group:
        yield ' '.join(group)
Python3 中的示例:
>>> t = """First line
a{{{b}}}c{{{d
e
f}}}g{{{h
i}}}
j
k
"""
>>> for line in merge_lines(io.StringIO(t)): print(line)
First line
a
b
c
d e f
g
h i
j
k
我的一点浅见(使用 join
):
ex1 = """First line
Second line
{{{
these three lines
I want to process
together
}}}
Last Line"""
ex2 = """First Line
Second line
Third{{{line
fourth line
fifth}}}line
sixth line"""
def parse_lines(txt, start_sep='{{{', end_sep='}}}'):
    """Split *txt* into lines, merging text between the two separators.

    The text is scanned once. Outside a group a line break ends the current
    entry. Inside a group a line break becomes a single space, or is dropped
    when the entry is still empty. Both separators also end the current
    entry. Empty entries are never emitted.

    A line break directly before *end_sep* therefore leaves one trailing
    space on the merged entry. A stray *end_sep* outside any group is
    ignored for nesting purposes, so it cannot cause later lines to merge.

    Args:
        txt: The text to split.
        start_sep: Non-empty marker that opens a group.
        end_sep: Non-empty marker that closes a group.

    Returns:
        A list of non-empty strings in document order.

    Raises:
        ValueError: If either separator is empty.
    """
    if not start_sep or not end_sep:
        raise ValueError("separators must be non-empty")

    depth = 0  # 1+ while inside a group
    lines = []
    buffer = []  # characters of the entry being built

    def flush():
        # Emit the pending entry, if any, and start a new one.
        if buffer:
            lines.append(''.join(buffer))
            del buffer[:]

    i = 0
    n = len(txt)
    while i < n:
        if txt[i] == '\n':
            if depth == 0:
                flush()
            elif buffer:
                # Inside a group a line break reads as a space.
                buffer.append(' ')
            i += 1
        elif txt.startswith(start_sep, i):
            depth += 1
            flush()
            i += len(start_sep)
        elif txt.startswith(end_sep, i):
            # Never go below zero: a negative depth would make every later
            # line break look like it is inside a group.
            if depth > 0:
                depth -= 1
            flush()
            i += len(end_sep)
        else:
            buffer.append(txt[i])
            i += 1
    flush()
    return lines
其返回结果为:
>>> parse_lines(ex1)
['First line', 'Second line', 'these three lines I want to process together ', 'Last Line']
>>> parse_lines(ex2)
['First Line', 'Second line', 'Third', 'line fourth line fifth', 'line', 'sixth line']
请注意第一个示例中以 '\n}}}'
结尾的多行中的额外 ' '
。