如何计算字典中单词的长度
How to count the length of words in a dictionary
我有这样一个字典列表:
# Sample input: one record per id. 'text' holds plain strings, 'text_2' holds
# 3-tuples whose last element is the sentence.
myList = [
    {
        'id': 1,
        'text': [
            'I like cheese.',
            'I love cheese.',
            'oh!',
        ],
        'text_2': [
            ('david', 'david', 'I do not like cheese.'),
            ('david', 'david', 'cheese is good.'),
        ],
    },
    {
        'id': 2,
        'text': [
            'I like strawberry.',
            'I love strawberry',
        ],
        'text_2': [
            ('alice', 'alice', 'strawberry is good.'),
            ('alice', 'alice', ' strawberry is so so.'),
        ],
    },
]
我想通过“id”统计“text”和“text_2”的元素个数和长度。理想的输出是:
# Desired result: for each id, every field becomes
# (number of items, number of words).
myList = [
    {
        'id': 1,
        'text': (3, 7),
        'text_2': (2, 8),
    },
    {
        'id': 2,
        'text': (2, 6),
        'text_2': (2, 7),
    },
]
'text':(3,7) 表示:3 个元素('I like cheese.'、'I love cheese.'、'oh!');7 个单词(I、like、cheese、I、love、cheese、oh)
'text_2':(2,8) 表示:2 个元素(('david','david','I do not like cheese.')、('david','david','cheese is good.'));8 个单词(I、do、not、like、cheese、cheese、is、good)
有什么建议吗?
比如像这样:
from itertools import chain
from string import punctuation
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    The characters dropped are those listed in ``string.punctuation``;
    everything else (letters, digits, whitespace) is kept in order.

    Args:
        text: The string to clean.

    Returns:
        A new string without punctuation characters.
    """
    return "".join(ch for ch in text if ch not in punctuation)
def count_items_and_words(items, label):
    """Count the items of one field and the words contained in them.

    Args:
        items: For ``label == "text"`` a list of strings; for
            ``label == "text_2"`` a list of tuples whose elements from
            index 2 onwards are strings.
        label: Either ``"text"`` or ``"text_2"``; selects how the text is
            extracted from *items*.

    Returns:
        A tuple ``(items_cnt, words_cnt)``: the number of items and the
        number of whitespace-separated words left after punctuation has
        been removed.

    Raises:
        ValueError: If *label* is not one of the supported values.
    """
    items_cnt = len(items)
    if label == "text":
        total_text = " ".join(items)
    elif label == "text_2":
        # Skip the first two entries of every tuple; only the rest is text.
        total_text = " ".join(chain.from_iterable(it[2:] for it in items))
    else:
        # An unknown label used to fail later with UnboundLocalError.
        raise ValueError("unsupported label: {!r}".format(label))
    words_cnt = len(remove_punctuation(total_text).split())
    return (items_cnt, words_cnt)
def count_all(my_list):
    """Build the per-id item/word counts for every record in *my_list*.

    Args:
        my_list: An iterable of records. Each dict must provide the keys
            ``"id"``, ``"text"`` and ``"text_2"``; entries that are not
            dicts are skipped.

    Returns:
        A list with one dict per record, holding the original ``"id"`` and
        the tuple returned by ``count_items_and_words`` for ``"text"`` and
        ``"text_2"``.
    """
    results = []
    for record in my_list:
        if not isinstance(record, dict):
            continue
        res = {"id": record["id"]}
        for label in ("text", "text_2"):
            res[label] = count_items_and_words(record[label], label)
        results.append(res)
    return results
# Run the counter on the question's data; the bare name on the last line
# only displays the value when run in an interactive session.
results = count_all(myList)
results
输出:
[{'id': 1, 'text': (3, 7), 'text_2': (2, 8)},
{'id': 2, 'text': (2, 6), 'text_2': (2, 7)}]
见下文
# Sample data copied from the question.
lst = [
    {
        'id': 1,
        'text': [
            'I like cheese.',
            'I love cheese.',
            'oh!'
        ],
        'text_2': [
            ('david', 'david', 'I do not like cheese.'),
            ('david', 'david', 'cheese is good.')
        ]
    },
    {
        'id': 2,
        'text': [
            'I like strawberry.',
            'I love strawberry'
        ],
        'text_2': [
            ('alice', 'alice', 'strawberry is good.'),
            ('alice', 'alice', ' strawberry is so so.')
        ]
    }
]

# Build one summary dict per input record.
out = []
for entry in lst:
    counts = {}
    for k, v in entry.items():
        if k == 'id':
            counts[k] = v
        elif k == 'text':
            # (number of strings, total whitespace-separated words)
            counts[k] = (len(v), sum(len(x.split()) for x in v))
        else:
            # NOTE(review): len(x) is the length of each tuple (3), not a
            # word count, so 'text_2' yields (2, 6) rather than the
            # (2, 8) / (2, 7) requested in the question.
            counts[k] = (len(v), sum(len(x) for x in v))
    out.append(counts)
print(out)
输出
[{'id': 1, 'text': (3, 7), 'text_2': (2, 6)}, {'id': 2, 'text': (2, 6), 'text_2': (2, 6)}]
如果你是新手,我的回答很难消化,但我希望你能找到一些对你未来有用的好组合......也因为你没有提供任何尝试。
' '.join(my_list)
将列表的元素连接成一个以空格分隔的字符串
my_string.split()
按空白字符切分字符串,得到一个列表(-> 这样你就可以数了)
set(my_list)
删除元素的多次出现
itertools.chain
连接可迭代对象的函数,将列表中的元组合并为单个对象
- 列表推导式,例如
[i for i in range(10) if i > 5]
由于您没有指定任何关于如何处理同一元素多次出现的规则,我只计算一次(因此 'david','david' 被计算为 1)
我对你的建议请求的回答是分而治之,把一个大问题分成小问题,解决它们,把它们粘在一起。
import itertools as it
myList = # see dictionary in the question
# For every record print "key value" for scalar fields and, for list fields,
# the pair (number of items, number of whitespace-separated words).
for d in myList:
    for k, v in d.items():
        if isinstance(v, list):
            if v and isinstance(v[0], str):
                words = ' '.join(v).split()
            else:
                # Tuples (or an empty list): flatten and keep each distinct
                # string once, so a repeated name such as 'david' is counted
                # as a single word. An empty list yields (0, 0) instead of
                # failing on v[0].
                words = ' '.join(set(it.chain.from_iterable(v))).split()
            pair = len(v), len(words)
            print(pair)
        else:
            print(k, v)
输出
id 1
(3, 7)
(2, 9)
id 2
(2, 6)
(2, 8)
我有这样一个字典列表:
# Sample input: one record per id. 'text' holds plain strings, 'text_2' holds
# 3-tuples whose last element is the sentence.
myList = [
    {
        'id': 1,
        'text': [
            'I like cheese.',
            'I love cheese.',
            'oh!',
        ],
        'text_2': [
            ('david', 'david', 'I do not like cheese.'),
            ('david', 'david', 'cheese is good.'),
        ],
    },
    {
        'id': 2,
        'text': [
            'I like strawberry.',
            'I love strawberry',
        ],
        'text_2': [
            ('alice', 'alice', 'strawberry is good.'),
            ('alice', 'alice', ' strawberry is so so.'),
        ],
    },
]
我想通过“id”统计“text”和“text_2”的元素个数和长度。理想的输出是:
# Desired result: for each id, every field becomes
# (number of items, number of words).
myList = [
    {
        'id': 1,
        'text': (3, 7),
        'text_2': (2, 8),
    },
    {
        'id': 2,
        'text': (2, 6),
        'text_2': (2, 7),
    },
]
'text':(3,7) 表示:3 个元素('I like cheese.'、'I love cheese.'、'oh!');7 个单词(I、like、cheese、I、love、cheese、oh)
'text_2':(2,8) 表示:2 个元素(('david','david','I do not like cheese.')、('david','david','cheese is good.'));8 个单词(I、do、not、like、cheese、cheese、is、good)
有什么建议吗?
比如像这样:
from itertools import chain
from string import punctuation
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    The characters dropped are those listed in ``string.punctuation``;
    everything else (letters, digits, whitespace) is kept in order.

    Args:
        text: The string to clean.

    Returns:
        A new string without punctuation characters.
    """
    return "".join(ch for ch in text if ch not in punctuation)
def count_items_and_words(items, label):
    """Count the items of one field and the words contained in them.

    Args:
        items: For ``label == "text"`` a list of strings; for
            ``label == "text_2"`` a list of tuples whose elements from
            index 2 onwards are strings.
        label: Either ``"text"`` or ``"text_2"``; selects how the text is
            extracted from *items*.

    Returns:
        A tuple ``(items_cnt, words_cnt)``: the number of items and the
        number of whitespace-separated words left after punctuation has
        been removed.

    Raises:
        ValueError: If *label* is not one of the supported values.
    """
    items_cnt = len(items)
    if label == "text":
        total_text = " ".join(items)
    elif label == "text_2":
        # Skip the first two entries of every tuple; only the rest is text.
        total_text = " ".join(chain.from_iterable(it[2:] for it in items))
    else:
        # An unknown label used to fail later with UnboundLocalError.
        raise ValueError("unsupported label: {!r}".format(label))
    words_cnt = len(remove_punctuation(total_text).split())
    return (items_cnt, words_cnt)
def count_all(my_list):
    """Build the per-id item/word counts for every record in *my_list*.

    Args:
        my_list: An iterable of records. Each dict must provide the keys
            ``"id"``, ``"text"`` and ``"text_2"``; entries that are not
            dicts are skipped.

    Returns:
        A list with one dict per record, holding the original ``"id"`` and
        the tuple returned by ``count_items_and_words`` for ``"text"`` and
        ``"text_2"``.
    """
    results = []
    for record in my_list:
        if not isinstance(record, dict):
            continue
        res = {"id": record["id"]}
        for label in ("text", "text_2"):
            res[label] = count_items_and_words(record[label], label)
        results.append(res)
    return results
# Run the counter on the question's data; the bare name on the last line
# only displays the value when run in an interactive session.
results = count_all(myList)
results
输出:
[{'id': 1, 'text': (3, 7), 'text_2': (2, 8)},
{'id': 2, 'text': (2, 6), 'text_2': (2, 7)}]
见下文
# Sample data copied from the question.
lst = [
    {
        'id': 1,
        'text': [
            'I like cheese.',
            'I love cheese.',
            'oh!'
        ],
        'text_2': [
            ('david', 'david', 'I do not like cheese.'),
            ('david', 'david', 'cheese is good.')
        ]
    },
    {
        'id': 2,
        'text': [
            'I like strawberry.',
            'I love strawberry'
        ],
        'text_2': [
            ('alice', 'alice', 'strawberry is good.'),
            ('alice', 'alice', ' strawberry is so so.')
        ]
    }
]

# Build one summary dict per input record.
out = []
for entry in lst:
    counts = {}
    for k, v in entry.items():
        if k == 'id':
            counts[k] = v
        elif k == 'text':
            # (number of strings, total whitespace-separated words)
            counts[k] = (len(v), sum(len(x.split()) for x in v))
        else:
            # NOTE(review): len(x) is the length of each tuple (3), not a
            # word count, so 'text_2' yields (2, 6) rather than the
            # (2, 8) / (2, 7) requested in the question.
            counts[k] = (len(v), sum(len(x) for x in v))
    out.append(counts)
print(out)
输出
[{'id': 1, 'text': (3, 7), 'text_2': (2, 6)}, {'id': 2, 'text': (2, 6), 'text_2': (2, 6)}]
如果你是新手,我的回答很难消化,但我希望你能找到一些对你未来有用的好组合......也因为你没有提供任何尝试。
' '.join(my_list)
将列表的元素连接成一个以空格分隔的字符串
my_string.split()
按空白字符切分字符串,得到一个列表(-> 这样你就可以数了)
set(my_list)
删除元素的多次出现
itertools.chain
连接可迭代对象的函数,将列表中的元组合并为单个对象
- 列表推导式,例如
[i for i in range(10) if i > 5]
由于您没有指定任何关于如何处理同一元素多次出现的规则,我只计算一次(因此 'david','david' 被计算为 1)
我对你的建议请求的回答是分而治之,把一个大问题分成小问题,解决它们,把它们粘在一起。
import itertools as it
myList = # see dictionary in the question
# For every record print "key value" for scalar fields and, for list fields,
# the pair (number of items, number of whitespace-separated words).
for d in myList:
    for k, v in d.items():
        if isinstance(v, list):
            if v and isinstance(v[0], str):
                words = ' '.join(v).split()
            else:
                # Tuples (or an empty list): flatten and keep each distinct
                # string once, so a repeated name such as 'david' is counted
                # as a single word. An empty list yields (0, 0) instead of
                # failing on v[0].
                words = ' '.join(set(it.chain.from_iterable(v))).split()
            pair = len(v), len(words)
            print(pair)
        else:
            print(k, v)
输出
id 1
(3, 7)
(2, 9)
id 2
(2, 6)
(2, 8)