在 Python 中从 CSV 的对应行获取值
Get values from corresponding row CSV Python
我有多个这样的 csv 文件:
csv1:
h1,h2,h3
aa,34,bd9
bb,459,jg0
csv2:
h1,h5,h2
aa,rg,87
aa,gru,90
bb,sf,459
对于第 0 列中带有 header h1 的每个值,我想从文件夹中的所有 csv 文件中获取其对应的 h2 值。样本输出可以是
csv1: (aa,34),(bb,459)
csv2: (aa,87,90),(bb,459)
我不知道该如何着手做这件事。
PS- 我不想使用 pandas。
PPS- 我可以通过对第 0 列的值进行硬编码来实现,但我不想那样做,因为有数百行。
这是我试过的一小段代码。它在不同的行中打印 'aa' 的 h2 值。我希望它们打印在同一行。
import csv
# Asker's attempt: print the h1/h2 columns of a single CSV file.
# newline="" lets the csv module handle line endings itself.
with open("test1/sample.csv", newline="") as csvfile:
    reader = csv.DictReader(csvfile, delimiter=",")
    # Every row is printed on its own line, so a repeated h1 value such as
    # 'aa' appears once per row instead of being grouped on a single line.
    for row in reader:
        print(row['h1'], row['h2'])
import glob
import csv
import os
from collections import defaultdict
def collect_pairs(folder):
    """Collect the (h1, h2) value pairs of every CSV file in *folder*.

    Args:
        folder: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(list)`` mapping each file's base name to a list of
        ``[h1, h2]`` pairs in row order.  Empty files and files whose header
        lacks ``h1`` or ``h2`` do not appear in the result.
    """
    pairs = defaultdict(list)
    # Glob inside *folder* itself so the files listed are the files opened;
    # sorting makes the result order independent of the file system.
    for csv_path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        # newline="" is what the csv module expects for files it parses.
        with open(csv_path, newline="") as f:
            r = csv.reader(f, delimiter=",")
            header = next(r, None)
            if header is None:
                continue  # empty file: nothing to read
            # make sure we have both columns before going further
            if "h1" not in header or "h2" not in header:
                continue
            h1 = header.index("h1")
            h2 = header.index("h2")
            # save file name as key appending each h1 and h2 value
            name = os.path.basename(csv_path)
            for row in r:
                # Blank or truncated rows have no value at one of the indexes.
                if len(row) > max(h1, h2):
                    pairs[name].append([row[h1], row[h2]])
    return pairs


path = "path_to_folder"
d = collect_pairs(path)
print(d)
defaultdict(<class 'list'>, {'csv1.csv': [['aa', '34'], ['bb', '459']], 'csv2.csv': [['aa', '87'], ['aa', '90'], ['bb', '459']]})
这是一个快速草稿,它假设所有文件都以逗号(,)分隔,并且每一行的 h1 和 h2 列都有值;在这种情况下,它会按出现顺序找出所有配对。
要获得一组唯一值,我们可以使用集合和 set.update:
def collect_unique_values(folder):
    """Collect the distinct h1 and h2 values of every CSV file in *folder*.

    Args:
        folder: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(set)`` mapping each file's base name to the set of
        all values found in its ``h1`` and ``h2`` columns.  Empty files and
        files whose header lacks ``h1`` or ``h2`` do not appear in the result.
    """
    values = defaultdict(set)  # change to set
    # Glob inside *folder* itself so the files listed are the files opened.
    for csv_path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        # newline="" is what the csv module expects for files it parses.
        with open(csv_path, newline="") as f:
            r = csv.reader(f, delimiter=",")
            header = next(r, None)
            if header is None:
                continue  # empty file: nothing to read
            # Skip files that lack either column.
            if "h1" not in header or "h2" not in header:
                continue
            h1 = header.index("h1")
            h2 = header.index("h2")
            name = os.path.basename(csv_path)
            for row in r:
                # Blank or truncated rows have no value at one of the indexes.
                if len(row) > max(h1, h2):
                    values[name].update([row[h1], row[h2]])  # set.update
    return values


path = "path_to_folder"
d = collect_unique_values(path)
print(d)
defaultdict(<class 'set'>, {'csv1.csv': {'459', '34', 'bb', 'aa'}, 'csv2.csv': {'459', '90', '87', 'bb', 'aa'}})
如果您确定始终有 h1 和 h2,则可以将代码简化为:
def collect_unique_values_simple(folder):
    """Collect distinct h1/h2 values, assuming every file has both columns.

    Args:
        folder: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(set)`` mapping each file's base name to the set of
        all values found in its ``h1`` and ``h2`` columns.

    Raises:
        ValueError: If a non-empty file's header has no ``h1`` or ``h2``
            column (raised by ``list.index``).
    """
    values = defaultdict(set)
    # Glob inside *folder* itself so the files listed are the files opened.
    for csv_path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        # newline="" is what the csv module expects for files it parses.
        with open(csv_path, newline="") as f:
            r = csv.reader(f, delimiter=",")
            header = next(r, None)
            if header is None:
                continue  # empty file: nothing to read
            h1 = header.index("h1")
            h2 = header.index("h2")
            name = os.path.basename(csv_path)
            for row in r:
                # Blank or truncated rows have no value at one of the indexes.
                if len(row) > max(h1, h2):
                    values[name].update([row[h1], row[h2]])
    return values


path = "path/"
d = collect_unique_values_simple(path)
最后,如果您想保持找到的元素的顺序,我们不能使用集合,因为它们是无序的,所以我们需要检查列表中是否已经存在任何一个元素:
def collect_ordered_unique(folder):
    """Collect distinct h1/h2 values per CSV file, in first-seen order.

    Args:
        folder: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(list)`` mapping each file's base name to its ``h1``
        and ``h2`` values, each value listed once in the order it was first
        encountered.

    Raises:
        ValueError: If a non-empty file's header has no ``h1`` or ``h2``
            column (raised by ``list.index``).
    """
    # Must be a list-backed mapping: the values are appended, and a set
    # would not keep their order.
    ordered = defaultdict(list)
    # Glob inside *folder* itself so the files listed are the files opened.
    for csv_path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        # newline="" is what the csv module expects for files it parses.
        with open(csv_path, newline="") as f:
            r = csv.reader(f, delimiter=",")
            header = next(r, None)
            if header is None:
                continue  # empty file: nothing to read
            h1 = header.index("h1")
            h2 = header.index("h2")
            name = os.path.basename(csv_path)
            for row in r:
                # Blank or truncated rows have no value at one of the indexes.
                if len(row) <= max(h1, h2):
                    continue
                h_1, h_2 = row[h1], row[h2]
                # Append each value only the first time it is seen.
                if h_1 not in ordered[name]:
                    ordered[name].append(h_1)
                if h_2 not in ordered[name]:
                    ordered[name].append(h_2)
    return ordered


path = "path/"
d = collect_ordered_unique(path)
print(d)
defaultdict(<class 'list'>, {'csv2.csv': ['aa', '87', '90', 'bb', '459'], 'csv1.csv': ['aa', '34', 'bb', '459']})
我有多个这样的 csv 文件:
csv1:
h1,h2,h3
aa,34,bd9
bb,459,jg0
csv2:
h1,h5,h2
aa,rg,87
aa,gru,90
bb,sf,459
对于第 0 列中带有 header h1 的每个值,我想从文件夹中的所有 csv 文件中获取其对应的 h2 值。样本输出可以是
csv1: (aa,34),(bb,459)
csv2: (aa,87,90),(bb,459)
我不知道该如何着手做这件事。
PS- 我不想使用 pandas。
PPS- 我可以通过对第 0 列的值进行硬编码来实现,但我不想那样做,因为有数百行。
这是我试过的一小段代码。它在不同的行中打印 'aa' 的 h2 值。我希望它们打印在同一行。
import csv
# Asker's attempt: print the h1/h2 columns of a single CSV file.
# newline="" lets the csv module handle line endings itself.
with open("test1/sample.csv", newline="") as csvfile:
    reader = csv.DictReader(csvfile, delimiter=",")
    # Every row is printed on its own line, so a repeated h1 value such as
    # 'aa' appears once per row instead of being grouped on a single line.
    for row in reader:
        print(row['h1'], row['h2'])
import glob
import csv
import os
from collections import defaultdict
def collect_pairs(folder):
    """Collect the (h1, h2) value pairs of every CSV file in *folder*.

    Args:
        folder: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(list)`` mapping each file's base name to a list of
        ``[h1, h2]`` pairs in row order.  Empty files and files whose header
        lacks ``h1`` or ``h2`` do not appear in the result.
    """
    pairs = defaultdict(list)
    # Glob inside *folder* itself so the files listed are the files opened;
    # sorting makes the result order independent of the file system.
    for csv_path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        # newline="" is what the csv module expects for files it parses.
        with open(csv_path, newline="") as f:
            r = csv.reader(f, delimiter=",")
            header = next(r, None)
            if header is None:
                continue  # empty file: nothing to read
            # make sure we have both columns before going further
            if "h1" not in header or "h2" not in header:
                continue
            h1 = header.index("h1")
            h2 = header.index("h2")
            # save file name as key appending each h1 and h2 value
            name = os.path.basename(csv_path)
            for row in r:
                # Blank or truncated rows have no value at one of the indexes.
                if len(row) > max(h1, h2):
                    pairs[name].append([row[h1], row[h2]])
    return pairs


path = "path_to_folder"
d = collect_pairs(path)
print(d)
defaultdict(<class 'list'>, {'csv1.csv': [['aa', '34'], ['bb', '459']], 'csv2.csv': [['aa', '87'], ['aa', '90'], ['bb', '459']]})
这是一个快速草稿,它假设所有文件都以逗号(,)分隔,并且每一行的 h1 和 h2 列都有值;在这种情况下,它会按出现顺序找出所有配对。
要获得一组唯一值,我们可以使用集合和 set.update:
def collect_unique_values(folder):
    """Collect the distinct h1 and h2 values of every CSV file in *folder*.

    Args:
        folder: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(set)`` mapping each file's base name to the set of
        all values found in its ``h1`` and ``h2`` columns.  Empty files and
        files whose header lacks ``h1`` or ``h2`` do not appear in the result.
    """
    values = defaultdict(set)  # change to set
    # Glob inside *folder* itself so the files listed are the files opened.
    for csv_path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        # newline="" is what the csv module expects for files it parses.
        with open(csv_path, newline="") as f:
            r = csv.reader(f, delimiter=",")
            header = next(r, None)
            if header is None:
                continue  # empty file: nothing to read
            # Skip files that lack either column.
            if "h1" not in header or "h2" not in header:
                continue
            h1 = header.index("h1")
            h2 = header.index("h2")
            name = os.path.basename(csv_path)
            for row in r:
                # Blank or truncated rows have no value at one of the indexes.
                if len(row) > max(h1, h2):
                    values[name].update([row[h1], row[h2]])  # set.update
    return values


path = "path_to_folder"
d = collect_unique_values(path)
print(d)
defaultdict(<class 'set'>, {'csv1.csv': {'459', '34', 'bb', 'aa'}, 'csv2.csv': {'459', '90', '87', 'bb', 'aa'}})
如果您确定始终有 h1 和 h2,则可以将代码简化为:
def collect_unique_values_simple(folder):
    """Collect distinct h1/h2 values, assuming every file has both columns.

    Args:
        folder: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(set)`` mapping each file's base name to the set of
        all values found in its ``h1`` and ``h2`` columns.

    Raises:
        ValueError: If a non-empty file's header has no ``h1`` or ``h2``
            column (raised by ``list.index``).
    """
    values = defaultdict(set)
    # Glob inside *folder* itself so the files listed are the files opened.
    for csv_path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        # newline="" is what the csv module expects for files it parses.
        with open(csv_path, newline="") as f:
            r = csv.reader(f, delimiter=",")
            header = next(r, None)
            if header is None:
                continue  # empty file: nothing to read
            h1 = header.index("h1")
            h2 = header.index("h2")
            name = os.path.basename(csv_path)
            for row in r:
                # Blank or truncated rows have no value at one of the indexes.
                if len(row) > max(h1, h2):
                    values[name].update([row[h1], row[h2]])
    return values


path = "path/"
d = collect_unique_values_simple(path)
最后,如果您想保持找到的元素的顺序,我们不能使用集合,因为它们是无序的,所以我们需要检查列表中是否已经存在任何一个元素:
def collect_ordered_unique(folder):
    """Collect distinct h1/h2 values per CSV file, in first-seen order.

    Args:
        folder: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        A ``defaultdict(list)`` mapping each file's base name to its ``h1``
        and ``h2`` values, each value listed once in the order it was first
        encountered.

    Raises:
        ValueError: If a non-empty file's header has no ``h1`` or ``h2``
            column (raised by ``list.index``).
    """
    # Must be a list-backed mapping: the values are appended, and a set
    # would not keep their order.
    ordered = defaultdict(list)
    # Glob inside *folder* itself so the files listed are the files opened.
    for csv_path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        # newline="" is what the csv module expects for files it parses.
        with open(csv_path, newline="") as f:
            r = csv.reader(f, delimiter=",")
            header = next(r, None)
            if header is None:
                continue  # empty file: nothing to read
            h1 = header.index("h1")
            h2 = header.index("h2")
            name = os.path.basename(csv_path)
            for row in r:
                # Blank or truncated rows have no value at one of the indexes.
                if len(row) <= max(h1, h2):
                    continue
                h_1, h_2 = row[h1], row[h2]
                # Append each value only the first time it is seen.
                if h_1 not in ordered[name]:
                    ordered[name].append(h_1)
                if h_2 not in ordered[name]:
                    ordered[name].append(h_2)
    return ordered


path = "path/"
d = collect_ordered_unique(path)
print(d)
defaultdict(<class 'list'>, {'csv2.csv': ['aa', '87', '90', 'bb', '459'], 'csv1.csv': ['aa', '34', 'bb', '459']})