Python 将计数器转换为 DataFrame 列
Python Convert Counters into DataFrame Columns
我在这里找不到针对我这个问题的答案,想请大家帮忙看看(抱歉只能贴链接,我还没有权限嵌入图片)。
我已将 Counter 对象存储在我的 DataFrame 中,并且还希望将它们作为每个计数元素的列添加到 DataFrame 中。
起始数据
# Starting data: two parallel text columns, one row per sample.
data = {
    "words": ["ABC", "BCDB", "CDE", "F"],
    "stuff": ["abc", "bcda", "cde", "f"],
}
df = pd.DataFrame(data=data)
Preliminary Data Frame
# Lookup table of patterns: "name" is the label of the resulting count
# column, "rex" is the regular expression whose matches are counted.
patternData = {
    "name": ["A", "B", "C", "D", "E", "F"],
    "rex": ["A{1}", "B{1}", "C{1}", "D{1}", "E{1}", "F{1}"],
}
patterns = pd.DataFrame(data=patternData)
Pattern DataFrame
def countFound(ps, pattern_table=None):
    """Count regex matches in ``ps`` for every named pattern.

    Args:
        ps: The string to search.
        pattern_table: Optional DataFrame with a ``name`` column (label to
            count under) and a ``rex`` column (regular expression). When
            omitted, the module-level ``patterns`` table is used.

    Returns:
        A ``Counter`` mapping each pattern name to its number of matches
        in ``ps``. Names whose pattern does not match are left out.
    """
    if pattern_table is None:
        pattern_table = patterns
    result = Counter()
    # Walk the two columns in lockstep instead of building a row object
    # per pattern with iterrows().
    for name, rex in zip(pattern_table['name'], pattern_table['rex']):
        hits = len(re.findall(rex, ps))
        # Skip zero counts so the Counter holds only names that were found.
        if hits:
            result[name] += hits
    return result
# Store one Counter per row; countFound is passed directly to apply.
df['found'] = df['words'].apply(countFound)
Found DataFrame
Desired Results
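| words | stuff | found | A | B | C | D | E | F |
|---|---|---|---|---|---|---|---|---|
| ABC | abc | {'A': 1, 'B': 1, 'C': 1} | 1 | 1 | 1 | 0 | 0 | 0 |
| BCDB | bcda | {'B': 2, 'C': 1, 'D': 1} | 0 | 2 | 1 | 1 | 0 | 0 |
| CDE | cde | {'C': 1, 'D': 1, 'E': 1} | 0 | 0 | 1 | 1 | 1 | 0 |
| F | f | {'F': 1} | 0 | 0 | 0 | 0 | 0 | 1 |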
Counter 的行为很像字典。在字典列表上调用 pd.DataFrame 将为您提供计数值矩阵:
# Count every word once and keep the Counters as a plain list.
found = df['words'].apply(countFound).to_list()
# A list of dict-like Counters expands to one column per key; keys absent
# from a row are NaN until they are filled with 0.
count_matrix = pd.DataFrame(found).fillna(0).astype("int")
pd.concat([df.assign(found=found), count_matrix], axis=1)
您可以使用 json_normalize:
# Expand each Counter in "found" into one column per key. Missing keys
# come back as NaN, so fill with 0 and cast back to integers explicitly
# (the `downcast` argument of fillna is deprecated in recent pandas).
df.join(pd.json_normalize(df['found']).fillna(0).astype("int"))
输出:
words stuff found A B C D E F
0 ABC abc {'A': 1, 'B': 1, 'C': 1} 1 1 1 0 0 0
1 BCDB bcda {'B': 2, 'C': 1, 'D': 1} 0 2 1 1 0 0
2 CDE cde {'C': 1, 'D': 1, 'E': 1} 0 0 1 1 1 0
3 F f {'F': 1} 0 0 0 0 0 1
您也可以不使用自定义函数直接获取列。为此,请使用带有命名捕获组的动态生成的正则表达式和 str.extractall:
# Build one alternation with a named capture group per pattern, so that
# extractall names its columns after the patterns.
regex = ('(?P<' + patterns['name'] + '>' + patterns['rex'] + ')').str.cat(sep='|')
# (?P<A>A{1})|(?P<B>B{1})|(?P<C>C{1})|(?P<D>D{1})|(?P<E>E{1})|(?P<F>F{1})
counts = (
    df['words']
    .str.extractall(regex)
    .groupby(level=0).count()
    # extractall drops rows that have no match at all; restore them as
    # all-zero rows so the join does not introduce NaN / float columns.
    .reindex(df.index, fill_value=0)
)
df2 = df.join(counts)
或者没有命名捕获组的变体,稍后设置列名:
# Same idea with plain capture groups; the column names are attached
# afterwards from the pattern table.
regex = ('(' + patterns['rex'] + ')').str.cat(sep='|')
# (A{1})|(B{1})|(C{1})|(D{1})|(E{1})|(F{1})
counts = (
    df['words']
    .str.extractall(regex)
    .set_axis(patterns['name'], axis=1)
    .groupby(level=0).count()
    # extractall drops rows that have no match at all; restore them as
    # all-zero rows so the join does not introduce NaN / float columns.
    .reindex(df.index, fill_value=0)
)
print(df.join(counts))
输出:
words stuff A B C D E F
0 ABC abc 1 1 1 0 0 0
1 BCDB bcda 0 2 1 1 0 0
2 CDE cde 0 0 1 1 1 0
3 F f 0 0 0 0 0 1
我在这里找不到针对我这个问题的答案,想请大家帮忙看看(抱歉只能贴链接,我还没有权限嵌入图片)。
我已将 Counter 对象存储在我的 DataFrame 中,并且还希望将它们作为每个计数元素的列添加到 DataFrame 中。
起始数据
# Starting data: two parallel text columns, one row per sample.
data = {
    "words": ["ABC", "BCDB", "CDE", "F"],
    "stuff": ["abc", "bcda", "cde", "f"],
}
df = pd.DataFrame(data=data)
Preliminary Data Frame
# Lookup table of patterns: "name" is the label of the resulting count
# column, "rex" is the regular expression whose matches are counted.
patternData = {
    "name": ["A", "B", "C", "D", "E", "F"],
    "rex": ["A{1}", "B{1}", "C{1}", "D{1}", "E{1}", "F{1}"],
}
patterns = pd.DataFrame(data=patternData)
Pattern DataFrame
def countFound(ps, pattern_table=None):
    """Count regex matches in ``ps`` for every named pattern.

    Args:
        ps: The string to search.
        pattern_table: Optional DataFrame with a ``name`` column (label to
            count under) and a ``rex`` column (regular expression). When
            omitted, the module-level ``patterns`` table is used.

    Returns:
        A ``Counter`` mapping each pattern name to its number of matches
        in ``ps``. Names whose pattern does not match are left out.
    """
    if pattern_table is None:
        pattern_table = patterns
    result = Counter()
    # Walk the two columns in lockstep instead of building a row object
    # per pattern with iterrows().
    for name, rex in zip(pattern_table['name'], pattern_table['rex']):
        hits = len(re.findall(rex, ps))
        # Skip zero counts so the Counter holds only names that were found.
        if hits:
            result[name] += hits
    return result
# Store one Counter per row; countFound is passed directly to apply.
df['found'] = df['words'].apply(countFound)
Found DataFrame
Desired Results
words | stuff | found | A | B | C | D | E | F |
---|---|---|---|---|---|---|---|---|
ABC | abc | {'A': 1, 'B': 1, 'C': 1} | 1 | 1 | 1 | 0 | 0 | 0 |
BCDB | bcda | {'B': 2, 'C': 1, 'D': 1} | 0 | 2 | 1 | 1 | 0 | 0 |
CDE | cde | {'C': 1, 'D': 1, 'E': 1} | 0 | 0 | 1 | 1 | 1 | 0 |
F | f | {'F': 1} | 0 | 0 | 0 | 0 | 0 | 1 |
Counter 的行为很像字典。在字典列表上调用 pd.DataFrame 将为您提供计数值矩阵:
# Count every word once and keep the Counters as a plain list.
found = df['words'].apply(countFound).to_list()
# A list of dict-like Counters expands to one column per key; keys absent
# from a row are NaN until they are filled with 0.
count_matrix = pd.DataFrame(found).fillna(0).astype("int")
pd.concat([df.assign(found=found), count_matrix], axis=1)
您可以使用 json_normalize:
# Expand each Counter in "found" into one column per key. Missing keys
# come back as NaN, so fill with 0 and cast back to integers explicitly
# (the `downcast` argument of fillna is deprecated in recent pandas).
df.join(pd.json_normalize(df['found']).fillna(0).astype("int"))
输出:
words stuff found A B C D E F
0 ABC abc {'A': 1, 'B': 1, 'C': 1} 1 1 1 0 0 0
1 BCDB bcda {'B': 2, 'C': 1, 'D': 1} 0 2 1 1 0 0
2 CDE cde {'C': 1, 'D': 1, 'E': 1} 0 0 1 1 1 0
3 F f {'F': 1} 0 0 0 0 0 1
您也可以不使用自定义函数直接获取列。为此,请使用带有命名捕获组的动态生成的正则表达式和 str.extractall:
# Build one alternation with a named capture group per pattern, so that
# extractall names its columns after the patterns.
regex = ('(?P<' + patterns['name'] + '>' + patterns['rex'] + ')').str.cat(sep='|')
# (?P<A>A{1})|(?P<B>B{1})|(?P<C>C{1})|(?P<D>D{1})|(?P<E>E{1})|(?P<F>F{1})
counts = (
    df['words']
    .str.extractall(regex)
    .groupby(level=0).count()
    # extractall drops rows that have no match at all; restore them as
    # all-zero rows so the join does not introduce NaN / float columns.
    .reindex(df.index, fill_value=0)
)
df2 = df.join(counts)
或者没有命名捕获组的变体,稍后设置列名:
# Same idea with plain capture groups; the column names are attached
# afterwards from the pattern table.
regex = ('(' + patterns['rex'] + ')').str.cat(sep='|')
# (A{1})|(B{1})|(C{1})|(D{1})|(E{1})|(F{1})
counts = (
    df['words']
    .str.extractall(regex)
    .set_axis(patterns['name'], axis=1)
    .groupby(level=0).count()
    # extractall drops rows that have no match at all; restore them as
    # all-zero rows so the join does not introduce NaN / float columns.
    .reindex(df.index, fill_value=0)
)
print(df.join(counts))
输出:
words stuff A B C D E F
0 ABC abc 1 1 1 0 0 0
1 BCDB bcda 0 2 1 1 0 0
2 CDE cde 0 0 1 1 1 0
3 F f 0 0 0 0 0 1