Python 多个键并转换为字典
Python Multiple Keys and Transform to Dictionary
我正在 Python 2.6.6 中导入一个 txt 文件,需要进行一些数据整理。我是 Python 的新手,正在努力 google 每一步来完成任务。你能帮忙或建议吗?
这是我的输入 myData.txt,如下所示。 header 不在数据中,但我把它放在这里以便于阅读。
key1|key2|group|v1|v2|v3|v4
1|A|-1|10|100|1|2
1|A|2|20|35|2|3
1|B|1|15|5|3|5
2|B|5|23|25|4|2
2|B|2|33|20|22|98
2|D|4|23|21|20|32
...
这是我想要得到的 pandas DataFrame 输出,如下所示。基本上,我想把 key1 和 key2 合并成一个组合键,并把 group、v1 和 v2 放进一个字典:以 group 为键,以列表 [v1, v2] 为值(v1 是第一个元素,v2 是第二个元素)。输出中不需要 v3 和 v4。
comboKey1 new_v1
1_A {"-1":[10,100], "2":[20,35]}
1_B {"1":[15,5]}
2_B {"2":[33,20], "5":[23,25]}
2_D {"4":[23,21]}
这是我现在拥有的。有人可以建议吗?
import pandas as pd
# The input has seven '|'-separated fields per row and no header line.
# Assigning five names to the seven-column frame fails with a length
# mismatch, so select and name the first five fields while reading; v3 and
# v4 are not needed in the output.
df1 = pd.read_csv(
    'myData.txt',
    header=None,
    sep='|',
    usecols=[0, 1, 2, 3, 4],
    names=['key1', 'key2', 'group', 'v1', 'v2'],
)
# Combined key built from key1 and key2, e.g. 1 and 'A' -> '1_A'.
df1['comboKey1'] = df1['key1'].map(str) + "_" + df1['key2'].map(str)
import pandas as pd
# Read all lines of the input; the ``with`` block closes the handle even if
# reading fails.
with open('data.txt', 'r') as file:
    lines = file.readlines()

# Dict where the info is stored: combo_key -> {group: [v1, v2]}
main_dict = {}
for line in lines:
    line = line.strip()
    # Skip blank lines (e.g. a trailing empty line), which have no fields
    # to index into.
    if not line:
        continue
    # Getting the list of values in the line
    # values  -> [key1, key2, group, v1, v2, v3, v4]
    # indexes ->   0     1     2     3   4   5   6
    values = line.split('|')
    # Creating combo_key, e.g. '1_A'
    combo_key = values[0] + "_" + values[1]
    # If the key has not been seen yet, start a new inner dict for it
    if combo_key not in main_dict:
        main_dict[combo_key] = {}
    # split() yields strings; convert v1/v2 so the stored lists hold numbers
    # as in the requested output (assumes integer v1/v2, as in the sample).
    main_dict[combo_key][values[2]] = [int(values[3]), int(values[4])]

# One row per combo key; the inner dict is stored in its string form.
data = [[key, str(main_dict[key])] for key in main_dict]
df = pd.DataFrame(data, columns=['ComboKey1', "new_v1"])
print(df)
之后如果需要,直接对 dict 排序即可 (:
如果只是达到想要的预期输出,那么下面的代码也可以应用。
import pandas as pd
from io import StringIO
# Sample of the '|'-separated input (no header row).  v3 and v4 are present
# in the text but are not loaded below.
YOUR_TXT_DATA = """\
1|A|-1|10|100|1|2
1|A|2|20|35|2|3
1|B|1|15|5|3|5
2|B|5|23|25|4|2
2|B|2|33|20|22|98
2|D|4|23|21|20|32
"""

# Load only the first five fields and give them names.
df = pd.read_csv(StringIO(YOUR_TXT_DATA), header=None,
                 usecols=list(range(5)),
                 names=['key1', 'key2', 'group', 'v1', 'v2'],
                 sep='|')

# Column-oriented result: comboKey1[i] is the combined key and new_v1[i] is
# its {group: [v1, v2]} mapping.
result_dict = dict(comboKey1=[], new_v1=[])
# Row position of every combo key already emitted, so a repeated key is found
# with one dict lookup instead of scanning the comboKey1 list.
row_of_key = {}
for key1, key2, group, v1, v2 in df.values:
    key = str(key1) + '_' + str(key2)
    if key not in row_of_key:
        row_of_key[key] = len(result_dict['comboKey1'])
        result_dict['comboKey1'].append(key)
        result_dict['new_v1'].append({str(group): [v1, v2]})
    else:
        # Combo key seen before: add (or overwrite) this group's entry.
        result_dict['new_v1'][row_of_key[key]][str(group)] = [v1, v2]

result_df = pd.DataFrame.from_dict(result_dict)
print(result_df)
输出
comboKey1 new_v1
0 1_A {'-1': [10, 100], '2': [20, 35]}
1 1_B {'1': [15, 5]}
2 2_B {'5': [23, 25], '2': [33, 20]}
3 2_D {'4': [23, 21]}
关于测试数据
我认为有一些特殊情况需要你考虑,假设数据如下。
key1|key2|group|v1|v2|v3|v4
1|A|-1|10|100|1|2
1|A|-1|10|100|1|2
1|A|-1|20|35|2|3
您的预期输出是多少? (案例1~3)
- 情况一:以最后为准。
1_A {'-1': [20, 35]}
(解决方案:字典)
- 情况 2:保留所有但不重复:
{('-1', (10, 100)), ('-1', (20, 35))}
(解决方案:集合 set)
- 情况 3:保留所有
1_A [('-1', (10, 100)), ('-1', (10, 100)), ('-1', (20, 35))]
(解决方案:列表)
代码:
from unittest import TestCase
import pandas as pd
from io import StringIO
# Input in which the same (key1, key2, group) combination occurs three times,
# used to show how each container type treats repeated groups.
OTHER_TXT_DATA = """\
1|A|-1|10|100|1|2
1|A|-1|10|100|1|2
1|A|-1|20|35|2|3
"""


class MyTests(TestCase):
    """Compare three containers for collecting repeated groups per combo key.

    Each test feeds OTHER_TXT_DATA through ``solution_base`` with a different
    pair of callbacks, so the per-key value is a dict, a set or a list.
    """

    def __init__(self, *args, **options):
        super().__init__(*args, **options)
        # Only the first five fields (key1, key2, group, v1, v2) are loaded.
        self.df = pd.read_csv(StringIO(OTHER_TXT_DATA), header=None,
                              usecols=list(range(5)),
                              names=['key1', 'key2', 'group', 'v1', 'v2'],
                              sep='|')

    def setUp(self) -> None:
        # Fresh accumulator for every test case.
        self.result_dict = dict(comboKey1=[], new_v1=[])

    def solution_base(self, new_v1_fun, update_v1_fun) -> pd.DataFrame:
        """Group the rows of ``self.df`` by combo key using the given callbacks.

        Args:
            new_v1_fun: called as ``new_v1_fun(group, v1, v2)`` the first time
                a combo key is seen; the tests use it to append a new
                container to ``self.result_dict['new_v1']``.
            update_v1_fun: called as ``update_v1_fun(index, group, v1, v2)``
                when the combo key was seen before; ``index`` is the key's
                position in ``self.result_dict['comboKey1']``.

        Returns:
            A DataFrame built from ``self.result_dict`` (also printed).
        """
        result_dict = self.result_dict
        # Position of each known combo key, so repeated keys are located with
        # a dict lookup instead of scanning the list.  Keys already in the
        # accumulator are honoured, using their first occurrence as
        # ``list.index`` would.
        row_of_key = {}
        for position, seen_key in enumerate(result_dict['comboKey1']):
            row_of_key.setdefault(seen_key, position)

        for key1, key2, group, v1, v2 in self.df.values:
            key = str(key1) + '_' + str(key2)
            if key not in row_of_key:
                row_of_key[key] = len(result_dict['comboKey1'])
                result_dict['comboKey1'].append(key)
                new_v1_fun(group, v1, v2)
            else:
                update_v1_fun(row_of_key[key], group, v1, v2)
        df = pd.DataFrame.from_dict(result_dict)
        print(df)
        return df

    def test_case_1_dict(self):
        # Case 1: a dict keeps only the last value seen for a repeated group.
        df = self.solution_base(
            new_v1_fun=lambda group, v1, v2:
                self.result_dict['new_v1'].append({str(group): [v1, v2]}),
            update_v1_fun=lambda index, group, v1, v2:
                self.result_dict['new_v1'][index].update({str(group): [v1, v2]}))
        self.assertTrue(df.equals(pd.DataFrame(
            columns=['comboKey1', 'new_v1'],
            data=[
                ['1_A', {'-1': [20, 35]}],
            ]
        )))

    def test_case_2_set(self):
        # Case 2: a set keeps every distinct (group, (v1, v2)) pair once.
        df = self.solution_base(
            new_v1_fun=lambda group, v1, v2:
                self.result_dict['new_v1'].append({(str(group), (v1, v2))}),
            update_v1_fun=lambda index, group, v1, v2:
                self.result_dict['new_v1'][index].add((str(group), (v1, v2))))
        self.assertTrue(df.equals(pd.DataFrame(
            columns=['comboKey1', 'new_v1'],
            data=[
                ['1_A', {('-1', (20, 35)), ('-1', (10, 100))}],
            ]
        )))

    def test_case_3_list(self):
        # Case 3: a list keeps every pair, duplicates included, in input order.
        df = self.solution_base(
            new_v1_fun=lambda group, v1, v2:
                self.result_dict['new_v1'].append([(str(group), (v1, v2))]),
            update_v1_fun=lambda index, group, v1, v2:
                self.result_dict['new_v1'][index].append((str(group), (v1, v2))))
        self.assertTrue(df.equals(pd.DataFrame(
            columns=['comboKey1', 'new_v1'],
            data=[
                ['1_A', [('-1', (10, 100)), ('-1', (10, 100)), ('-1', (20, 35))]],
            ]
        )))
注意:Python 2 不支持类型注解(见 PEP 484)。
我正在 Python 2.6.6 中导入一个 txt 文件,需要进行一些数据整理。我是 Python 的新手,正在努力 google 每一步来完成任务。你能帮忙或建议吗?
这是我的输入 myData.txt,如下所示。 header 不在数据中,但我把它放在这里以便于阅读。
key1|key2|group|v1|v2|v3|v4
1|A|-1|10|100|1|2
1|A|2|20|35|2|3
1|B|1|15|5|3|5
2|B|5|23|25|4|2
2|B|2|33|20|22|98
2|D|4|23|21|20|32
...
这是我想要得到的 pandas DataFrame 输出,如下所示。基本上,我想把 key1 和 key2 合并成一个组合键,并把 group、v1 和 v2 放进一个字典:以 group 为键,以列表 [v1, v2] 为值(v1 是第一个元素,v2 是第二个元素)。输出中不需要 v3 和 v4。
comboKey1 new_v1
1_A {"-1":[10,100], "2":[20,35]}
1_B {"1":[15,5]}
2_B {"2":[33,20], "5":[23,25]}
2_D {"4":[23,21]}
这是我现在拥有的。有人可以建议吗?
import pandas as pd
# The input has seven '|'-separated fields per row and no header line.
# Assigning five names to the seven-column frame fails with a length
# mismatch, so select and name the first five fields while reading; v3 and
# v4 are not needed in the output.
df1 = pd.read_csv(
    'myData.txt',
    header=None,
    sep='|',
    usecols=[0, 1, 2, 3, 4],
    names=['key1', 'key2', 'group', 'v1', 'v2'],
)
# Combined key built from key1 and key2, e.g. 1 and 'A' -> '1_A'.
df1['comboKey1'] = df1['key1'].map(str) + "_" + df1['key2'].map(str)
import pandas as pd
# Read all lines of the input; the ``with`` block closes the handle even if
# reading fails.
with open('data.txt', 'r') as file:
    lines = file.readlines()

# Dict where the info is stored: combo_key -> {group: [v1, v2]}
main_dict = {}
for line in lines:
    line = line.strip()
    # Skip blank lines (e.g. a trailing empty line), which have no fields
    # to index into.
    if not line:
        continue
    # Getting the list of values in the line
    # values  -> [key1, key2, group, v1, v2, v3, v4]
    # indexes ->   0     1     2     3   4   5   6
    values = line.split('|')
    # Creating combo_key, e.g. '1_A'
    combo_key = values[0] + "_" + values[1]
    # If the key has not been seen yet, start a new inner dict for it
    if combo_key not in main_dict:
        main_dict[combo_key] = {}
    # split() yields strings; convert v1/v2 so the stored lists hold numbers
    # as in the requested output (assumes integer v1/v2, as in the sample).
    main_dict[combo_key][values[2]] = [int(values[3]), int(values[4])]

# One row per combo key; the inner dict is stored in its string form.
data = [[key, str(main_dict[key])] for key in main_dict]
df = pd.DataFrame(data, columns=['ComboKey1', "new_v1"])
print(df)
之后如果需要,直接对 dict 排序即可 (:
如果只是达到想要的预期输出,那么下面的代码也可以应用。
import pandas as pd
from io import StringIO
# Sample of the '|'-separated input (no header row).  v3 and v4 are present
# in the text but are not loaded below.
YOUR_TXT_DATA = """\
1|A|-1|10|100|1|2
1|A|2|20|35|2|3
1|B|1|15|5|3|5
2|B|5|23|25|4|2
2|B|2|33|20|22|98
2|D|4|23|21|20|32
"""

# Load only the first five fields and give them names.
df = pd.read_csv(StringIO(YOUR_TXT_DATA), header=None,
                 usecols=list(range(5)),
                 names=['key1', 'key2', 'group', 'v1', 'v2'],
                 sep='|')

# Column-oriented result: comboKey1[i] is the combined key and new_v1[i] is
# its {group: [v1, v2]} mapping.
result_dict = dict(comboKey1=[], new_v1=[])
# Row position of every combo key already emitted, so a repeated key is found
# with one dict lookup instead of scanning the comboKey1 list.
row_of_key = {}
for key1, key2, group, v1, v2 in df.values:
    key = str(key1) + '_' + str(key2)
    if key not in row_of_key:
        row_of_key[key] = len(result_dict['comboKey1'])
        result_dict['comboKey1'].append(key)
        result_dict['new_v1'].append({str(group): [v1, v2]})
    else:
        # Combo key seen before: add (or overwrite) this group's entry.
        result_dict['new_v1'][row_of_key[key]][str(group)] = [v1, v2]

result_df = pd.DataFrame.from_dict(result_dict)
print(result_df)
输出
comboKey1 new_v1
0 1_A {'-1': [10, 100], '2': [20, 35]}
1 1_B {'1': [15, 5]}
2 2_B {'5': [23, 25], '2': [33, 20]}
3 2_D {'4': [23, 21]}
关于测试数据
我认为有一些特殊情况需要你考虑,假设数据如下。
key1|key2|group|v1|v2|v3|v4
1|A|-1|10|100|1|2
1|A|-1|10|100|1|2
1|A|-1|20|35|2|3
您的预期输出是多少? (案例1~3)
- 情况一:以最后为准。
1_A {'-1': [20, 35]}
(解决方案:字典)
- 情况 2:保留所有但不重复:
{('-1', (10, 100)), ('-1', (20, 35))}
(解决方案:集合 set)
- 情况 3:保留所有
1_A [('-1', (10, 100)), ('-1', (10, 100)), ('-1', (20, 35))]
(解决方案:列表)
代码:
from unittest import TestCase
import pandas as pd
from io import StringIO
# Input in which the same (key1, key2, group) combination occurs three times,
# used to show how each container type treats repeated groups.
OTHER_TXT_DATA = """\
1|A|-1|10|100|1|2
1|A|-1|10|100|1|2
1|A|-1|20|35|2|3
"""


class MyTests(TestCase):
    """Compare three containers for collecting repeated groups per combo key.

    Each test feeds OTHER_TXT_DATA through ``solution_base`` with a different
    pair of callbacks, so the per-key value is a dict, a set or a list.
    """

    def __init__(self, *args, **options):
        super().__init__(*args, **options)
        # Only the first five fields (key1, key2, group, v1, v2) are loaded.
        self.df = pd.read_csv(StringIO(OTHER_TXT_DATA), header=None,
                              usecols=list(range(5)),
                              names=['key1', 'key2', 'group', 'v1', 'v2'],
                              sep='|')

    def setUp(self) -> None:
        # Fresh accumulator for every test case.
        self.result_dict = dict(comboKey1=[], new_v1=[])

    def solution_base(self, new_v1_fun, update_v1_fun) -> pd.DataFrame:
        """Group the rows of ``self.df`` by combo key using the given callbacks.

        Args:
            new_v1_fun: called as ``new_v1_fun(group, v1, v2)`` the first time
                a combo key is seen; the tests use it to append a new
                container to ``self.result_dict['new_v1']``.
            update_v1_fun: called as ``update_v1_fun(index, group, v1, v2)``
                when the combo key was seen before; ``index`` is the key's
                position in ``self.result_dict['comboKey1']``.

        Returns:
            A DataFrame built from ``self.result_dict`` (also printed).
        """
        result_dict = self.result_dict
        # Position of each known combo key, so repeated keys are located with
        # a dict lookup instead of scanning the list.  Keys already in the
        # accumulator are honoured, using their first occurrence as
        # ``list.index`` would.
        row_of_key = {}
        for position, seen_key in enumerate(result_dict['comboKey1']):
            row_of_key.setdefault(seen_key, position)

        for key1, key2, group, v1, v2 in self.df.values:
            key = str(key1) + '_' + str(key2)
            if key not in row_of_key:
                row_of_key[key] = len(result_dict['comboKey1'])
                result_dict['comboKey1'].append(key)
                new_v1_fun(group, v1, v2)
            else:
                update_v1_fun(row_of_key[key], group, v1, v2)
        df = pd.DataFrame.from_dict(result_dict)
        print(df)
        return df

    def test_case_1_dict(self):
        # Case 1: a dict keeps only the last value seen for a repeated group.
        df = self.solution_base(
            new_v1_fun=lambda group, v1, v2:
                self.result_dict['new_v1'].append({str(group): [v1, v2]}),
            update_v1_fun=lambda index, group, v1, v2:
                self.result_dict['new_v1'][index].update({str(group): [v1, v2]}))
        self.assertTrue(df.equals(pd.DataFrame(
            columns=['comboKey1', 'new_v1'],
            data=[
                ['1_A', {'-1': [20, 35]}],
            ]
        )))

    def test_case_2_set(self):
        # Case 2: a set keeps every distinct (group, (v1, v2)) pair once.
        df = self.solution_base(
            new_v1_fun=lambda group, v1, v2:
                self.result_dict['new_v1'].append({(str(group), (v1, v2))}),
            update_v1_fun=lambda index, group, v1, v2:
                self.result_dict['new_v1'][index].add((str(group), (v1, v2))))
        self.assertTrue(df.equals(pd.DataFrame(
            columns=['comboKey1', 'new_v1'],
            data=[
                ['1_A', {('-1', (20, 35)), ('-1', (10, 100))}],
            ]
        )))

    def test_case_3_list(self):
        # Case 3: a list keeps every pair, duplicates included, in input order.
        df = self.solution_base(
            new_v1_fun=lambda group, v1, v2:
                self.result_dict['new_v1'].append([(str(group), (v1, v2))]),
            update_v1_fun=lambda index, group, v1, v2:
                self.result_dict['new_v1'][index].append((str(group), (v1, v2))))
        self.assertTrue(df.equals(pd.DataFrame(
            columns=['comboKey1', 'new_v1'],
            data=[
                ['1_A', [('-1', (10, 100)), ('-1', (10, 100)), ('-1', (20, 35))]],
            ]
        )))
注意:Python 2 不支持类型注解(见 PEP 484)。