替换 CSV 文件中的模式 Python
Replace a pattern in CSV file Python
我有多个 CSV 文件,其中同一类信息可能有多种写法。例如,15 岁可以写成 age: 15、age (years): 15 或 age: 15 years(这些是我到目前为止见过的所有写法)。我想把它们全部替换为 15 years。如果事先知道具体的年龄值或列号,我知道该怎么做;但每次出现的年龄都不相同,而且所在的列也不固定。CSV 文件可能如下所示:
CSV1:
h1,h2,h3
A1,age:15,hh
B3,age:10,fg
期望得到的 CSV1:
h1,h2,h3
A1,15 years,hh
B3,10 years,fg
当写成 age: 15 时,单位一定是年,而不是月或其他任何单位。
像下面这样使用 re.sub:
# Rewrite an "age: N", "age (years): N" or "age: N years" field as "N years".
# Group 1 consumes the leading comma (or matches empty at the start of the
# line) and group 2 captures the number, so the replacement must put both
# back with \1 and \2; without them the number itself would be discarded.
re.sub(r'(,|^)(?:age\s*(?:\(years\))?:\s*(\d+)\s*(?:years)?)(?=,|$)',
       r'\1\2 years', string)
示例:
import re
import csv

# Matches a whole CSV field of the form "age: N", "age (years): N" or
# "age: N years". Group 1 is the leading comma (empty at start of line),
# group 2 is the number; the lookahead keeps the trailing comma untouched.
AGE_FIELD = re.compile(
    r'(,|^)(?:age\s*(?:\(years\))?:\s*(\d+)\s*(?:years)?)(?=,|$)')


def normalize_ages(line):
    """Return *line* with every age field rewritten as "<number> years".

    The replacement re-inserts group 1 (the separator) and group 2 (the
    number); fields that do not match the age pattern are left unchanged.
    """
    return AGE_FIELD.sub(r'\1\2 years', line)


if __name__ == '__main__':
    with open('file') as f:
        for row in csv.reader(f):
            # Fields are re-joined with a plain comma for display, so a
            # field that itself contains a comma is not re-quoted here.
            print(normalize_ages(','.join(row)))
输出:
h1,h2,h3
A1,15 years,hh
B3,10 years,fg
或
for row in reader:
    # Looser variant: any field containing "age:" followed somewhere by a
    # whole number is replaced by "<number> years". \1 restores the leading
    # comma consumed by group 1 and \2 is the captured number.
    print(re.sub(r'(,|^)[^,\n]*age\s*:[^,\n]*\b(\d+)\b[^,\n]*',
                 r'\1\2 years', ','.join(row)))
使用字符串的 translate 方法(配合转换表)。
import csv
from string import ascii_uppercase, ascii_lowercase, whitespace

# Characters stripped from the age field. Besides letters and the colon this
# also removes parentheses and whitespace, so that "age (years): 15" and
# "age: 15 years" reduce to the bare digits just like "age:15" does.
delete = ascii_uppercase + ascii_lowercase + whitespace + ":()"
# Python 3 form: the third argument of str.maketrans lists characters to drop.
tran = str.maketrans("", "", delete)


def to_years(field):
    """Return *field* with all characters in ``delete`` removed, plus " years"."""
    return field.translate(tran) + " years"


if __name__ == "__main__":
    # newline="" is what the csv module expects for files it reads and writes.
    with open("infile.csv", newline="") as infile, \
            open("output.csv", "w", newline="") as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        for row in reader:
            # assuming the second field here
            # NOTE(review): every row is rewritten, including a header row if
            # the file has one -- skip it first if that is not wanted.
            if len(row) > 1:  # blank or short rows have no second field
                row[1] = to_years(row[1])
            writer.writerow(row)
在适用的情况下,我通常更喜欢 string.translate 而不是正则表达式,因为它更容易理解和调试。
这只是一种猜测,但如果规则是:凡是包含单词 "year" 或 "age" 以及若干十进制数字的字段都要转换,那么下面的代码应该可以做到。
import csv
import os
import re

# A field is treated as an age when it mentions "year" or "age" (any case).
_is_age_search = re.compile(r"year|age", re.IGNORECASE).search
# First run of decimal digits in the field.
_find_num_search = re.compile(r"(\d+)").search
outdir = '/some/dir'


def normalize_age_fields(row):
    """Return a copy of *row* with age-like fields rewritten as "<n> years".

    A field is rewritten only when it matches ``_is_age_search`` and contains
    at least one digit run; all other fields are copied through unchanged.
    """
    result = []
    for val in row:
        if _is_age_search(val):
            found = _find_num_search(val)
            if found:
                # The captured group is a string; convert it before using
                # %d, which otherwise raises TypeError.
                val = "%d years" % int(found.group(1))
        result.append(val)
    return result


if __name__ == "__main__":
    # csv_filenames is expected to be defined by the surrounding script.
    for filename in csv_filenames:
        # newline='' is what the csv module expects for files it reads/writes.
        with open(filename, newline='') as f_in, \
                open(os.path.join(outdir, filename), 'w', newline='') as f_out:
            writer = csv.writer(f_out)
            for row in csv.reader(f_in):
                writer.writerow(normalize_age_fields(row))
我有多个 CSV 文件,其中同一类信息可能有多种写法。例如,15 岁可以写成 age: 15、age (years): 15 或 age: 15 years(这些是我到目前为止见过的所有写法)。我想把它们全部替换为 15 years。如果事先知道具体的年龄值或列号,我知道该怎么做;但每次出现的年龄都不相同,而且所在的列也不固定。CSV 文件可能如下所示:
CSV1:
h1,h2,h3
A1,age:15,hh
B3,age:10,fg
期望得到的 CSV1:
h1,h2,h3
A1,15 years,hh
B3,10 years,fg
当写成 age: 15 时,单位一定是年,而不是月或其他任何单位。
像下面这样使用 re.sub:
# Rewrite an "age: N", "age (years): N" or "age: N years" field as "N years".
# Group 1 consumes the leading comma (or matches empty at the start of the
# line) and group 2 captures the number, so the replacement must put both
# back with \1 and \2; without them the number itself would be discarded.
re.sub(r'(,|^)(?:age\s*(?:\(years\))?:\s*(\d+)\s*(?:years)?)(?=,|$)',
       r'\1\2 years', string)
示例:
import re
import csv

# Matches a whole CSV field of the form "age: N", "age (years): N" or
# "age: N years". Group 1 is the leading comma (empty at start of line),
# group 2 is the number; the lookahead keeps the trailing comma untouched.
AGE_FIELD = re.compile(
    r'(,|^)(?:age\s*(?:\(years\))?:\s*(\d+)\s*(?:years)?)(?=,|$)')


def normalize_ages(line):
    """Return *line* with every age field rewritten as "<number> years".

    The replacement re-inserts group 1 (the separator) and group 2 (the
    number); fields that do not match the age pattern are left unchanged.
    """
    return AGE_FIELD.sub(r'\1\2 years', line)


if __name__ == '__main__':
    with open('file') as f:
        for row in csv.reader(f):
            # Fields are re-joined with a plain comma for display, so a
            # field that itself contains a comma is not re-quoted here.
            print(normalize_ages(','.join(row)))
输出:
h1,h2,h3
A1,15 years,hh
B3,10 years,fg
或
for row in reader:
    # Looser variant: any field containing "age:" followed somewhere by a
    # whole number is replaced by "<number> years". \1 restores the leading
    # comma consumed by group 1 and \2 is the captured number.
    print(re.sub(r'(,|^)[^,\n]*age\s*:[^,\n]*\b(\d+)\b[^,\n]*',
                 r'\1\2 years', ','.join(row)))
使用字符串的 translate 方法(配合转换表)。
import csv
from string import ascii_uppercase, ascii_lowercase, whitespace

# Characters stripped from the age field. Besides letters and the colon this
# also removes parentheses and whitespace, so that "age (years): 15" and
# "age: 15 years" reduce to the bare digits just like "age:15" does.
delete = ascii_uppercase + ascii_lowercase + whitespace + ":()"
# Python 3 form: the third argument of str.maketrans lists characters to drop.
tran = str.maketrans("", "", delete)


def to_years(field):
    """Return *field* with all characters in ``delete`` removed, plus " years"."""
    return field.translate(tran) + " years"


if __name__ == "__main__":
    # newline="" is what the csv module expects for files it reads and writes.
    with open("infile.csv", newline="") as infile, \
            open("output.csv", "w", newline="") as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        for row in reader:
            # assuming the second field here
            # NOTE(review): every row is rewritten, including a header row if
            # the file has one -- skip it first if that is not wanted.
            if len(row) > 1:  # blank or short rows have no second field
                row[1] = to_years(row[1])
            writer.writerow(row)
在适用的情况下,我通常更喜欢 string.translate 而不是正则表达式,因为它更容易理解和调试。
这只是一种猜测,但如果规则是:凡是包含单词 "year" 或 "age" 以及若干十进制数字的字段都要转换,那么下面的代码应该可以做到。
import csv
import os
import re

# A field is treated as an age when it mentions "year" or "age" (any case).
_is_age_search = re.compile(r"year|age", re.IGNORECASE).search
# First run of decimal digits in the field.
_find_num_search = re.compile(r"(\d+)").search
outdir = '/some/dir'


def normalize_age_fields(row):
    """Return a copy of *row* with age-like fields rewritten as "<n> years".

    A field is rewritten only when it matches ``_is_age_search`` and contains
    at least one digit run; all other fields are copied through unchanged.
    """
    result = []
    for val in row:
        if _is_age_search(val):
            found = _find_num_search(val)
            if found:
                # The captured group is a string; convert it before using
                # %d, which otherwise raises TypeError.
                val = "%d years" % int(found.group(1))
        result.append(val)
    return result


if __name__ == "__main__":
    # csv_filenames is expected to be defined by the surrounding script.
    for filename in csv_filenames:
        # newline='' is what the csv module expects for files it reads/writes.
        with open(filename, newline='') as f_in, \
                open(os.path.join(outdir, filename), 'w', newline='') as f_out:
            writer = csv.writer(f_out)
            for row in csv.reader(f_in):
                writer.writerow(normalize_age_fields(row))