如何将 Twitch IRC 响应中的表情解析为字典列表?
How can I parse the emotes out of a Twitch IRC response into a list of dictionaries?
我想将来自 Twitch 的 IRC 消息解析为字典列表,并考虑表情。
这是我可以从 Twitch 获得的示例:
"Testing. :) Confirmed!"
{"emotes": [(1, (9, 10))]}
说明第9到10个字符(字符串从零开始)有ID为1的表情。
我希望我的数据采用以下格式:
[
{
"type": "text",
"text": "Testing. "
},
{
"type": "emote",
"text": ":)",
"id": 1
},
{
"type": "text",
"text": " Confirmed!"
}
]
是否有相对简洁的方法来完成此操作?
我找到了一个解决方案,虽然很丑陋,但有效。
import re
# Matches a tagged Twitch PRIVMSG line.  Groups: 1 = raw tag string (optional,
# starts with '@'), 2 = sender login, 3 = channel name, 4 = message text.
# The user/host parts around '@' are matched with \w* so the real
# "nick!nick@nick.tmi.twitch.tv" prefix is accepted.
packet_expression = re.compile(
    r'(@.+)? :([a-zA-Z0-9][\w]{2,23})!\w*@\w*\.tmi\.twitch\.tv'
    r' PRIVMSG #([a-zA-Z0-9][\w]{2,23}) :(.+)'
)

# One "<id>:<start>-<end>[,<start>-<end>...]" entry of the ``emotes`` tag.
# Compiled once at import time instead of on every call.
_emote_expression = re.compile(r'(\d+):((?:\d+-\d+,)*\d+-\d+)')


def parse_twitch(packet):
    """Split the message of a Twitch PRIVMSG packet into text/emote parts.

    Args:
        packet: A raw IRC line of the form
            ``[@tags] :nick!user@host.tmi.twitch.tv PRIVMSG #channel :message``.

    Returns:
        A list of dictionaries in message order.  Plain text runs are
        ``{"type": "text", "text": ...}`` and emotes are
        ``{"type": "emote", "text": ..., "id": <int>}``.

    Raises:
        ValueError: If *packet* does not look like a PRIVMSG line.
    """
    match = packet_expression.match(packet)
    if match is None:
        raise ValueError("not a Twitch PRIVMSG packet: %r" % (packet,))

    # The tag group is optional; without it there are simply no emotes.
    tags = {}
    raw_tags = match.group(1)
    if raw_tags:
        for item in raw_tags[1:].split(';'):
            # partition() keeps any further '=' inside the value.
            key, _, value = item.partition('=')
            tags[key] = value

    # The tag groups ranges by emote id, not by position, so sort by start
    # index before walking the message from left to right.
    emotes = sorted(
        (int(start), int(end), int(emote_id))
        for emote_id, locations in _emote_expression.findall(tags.get("emotes", ''))
        for start, end in (location.split('-') for location in locations.split(','))
    )

    message = match.group(4)
    parts = []
    cursor = 0
    for start, end, emote_id in emotes:
        if start > cursor:
            parts.append({"type": "text", "text": message[cursor:start]})
        # Emote ranges are inclusive on both ends.
        parts.append({
            "type": "emote",
            "text": message[start:end + 1],
            "id": emote_id,
        })
        cursor = max(cursor, end + 1)
    if cursor < len(message):
        parts.append({"type": "text", "text": message[cursor:]})
    return parts
我不确定你收到的消息是不是这样的:
# Possibility 1: the message text and the emote metadata arrive together
# as a single two-line string.
message = '''\
"Testing. :) Confirmed!"
{"emotes": [(1, (9, 10))]}'''
或
# Possibility 2: the message text and the emote metadata arrive as two
# separate strings.
text = "Testing. :) Confirmed!"
meta = '{"emotes": [(1, (9, 10))]}'
我假设是后者,因为从前者转换到后者很容易。也可能是 python 表示。你说的不是很清楚
有一种好得多的方法可以解决这个问题:不使用正则表达式,只使用字符串解析:
import json
def split_by_emotes(text, emotes):
    """Split *text* into a list of text and emote segments.

    Args:
        text: The chat message.
        emotes: Iterable of ``(id, (start, end))`` entries, where ``start``
            and ``end`` are zero-based, inclusive character positions in
            *text*.  Entries may be given in any order.

    Returns:
        A list of ``{'type': 'text', 'text': ...}`` and
        ``{'type': 'emote', 'text': ..., 'id': ...}`` dictionaries in
        message order.  Empty text segments are never emitted.
    """
    segments = []
    cur_index = 0
    # Walk the emotes from left to right regardless of the input order.
    for emote_id, (start, end) in sorted(emotes, key=lambda emote: emote[1][0]):
        # Skip the text segment when the emote starts the message or
        # directly follows the previous emote.
        if start > cur_index:
            segments.append({'type': 'text', 'text': text[cur_index:start]})
        segments.append({'type': 'emote', 'text': text[start:end + 1],
                         'id': emote_id})
        cur_index = end + 1
    if text[cur_index:]:
        segments.append({'type': 'text', 'text': text[cur_index:]})
    return segments


text = 'Testing. :) Confirmed! :P'
print(len(text))
meta = '{"emotes": [(1, (9, 10)), (2, (23,25))]}'
# The metadata uses tuple syntax, which JSON lacks; turn the parentheses
# into brackets so it can be loaded as nested lists.
meta = json.loads(meta.replace('(', '[').replace(')', ']'))
results = split_by_emotes(text, meta['emotes'])
import pprint
pprint.pprint(results)
根据您的评论,数据采用的是自定义格式。我从评论中复制粘贴了示例数据,其中有几个字符我不确定是否真的出现在传入的数据中,希望我没有弄错。示例消息中也只有一种表情,所以我不完全确定多种不同的表情类型是如何表示的——我希望它们之间是用某种分隔符隔开,而不是出现多个 emote= 部分,否则这种方法需要做较大的修改。不过,下面的代码应该可以在不使用正则表达式的情况下完成解析。
from collections import namedtuple
# One occurrence of an emote: its numeric id and the inclusive, zero-based
# character range it occupies in the message.
Emote = namedtuple('Emote', ('id', 'start', 'end'))


def parse_emotes(raw):
    """Parse the value of an ``emotes`` tag into a list of ``Emote`` tuples.

    Args:
        raw: Tag value of the form ``"<id>:<start>-<end>,<start>-<end>/<id>:..."``.
            An empty string (a message without emotes) is accepted.

    Returns:
        A list of ``Emote`` tuples, one per occurrence, in the order they
        appear in *raw*.

    Raises:
        ValueError: If a non-empty entry is malformed.
    """
    emotes = []
    for raw_emote in raw.split('/'):
        # ''.split('/') yields [''], so an empty tag value lands here.
        if not raw_emote:
            continue
        emote_id, locations = raw_emote.split(':')
        emote_id = int(emote_id)
        for location in locations.split(','):
            start, end = location.split('-')
            emotes.append(Emote(id=emote_id, start=int(start), end=int(end)))
    return emotes
# Sample tagged PRIVMSG line used to exercise parse_emotes().
data = r'@badges=moderator/1;color=#0000FF;display-name=2Cubed;emotes=25:6-10,12-16;id=05aada01-f8c1-4b2e-a5be-2534096057b9;mod=1;room-id=82607708;subscriber=0;turbo=0;user-id=54561464;user-type=mod:2cubed!2cubed@2cubed.tmi.twitch.tv PRIVMSG #innectic :Hiya! Kappa Kappa'
# Cut on the first three spaces only, so the message keeps its own spaces.
# For this sample the pieces are: the tag string (with the sender prefix
# still attached to the last tag), 'PRIVMSG', '#innectic', and the message
# text with its leading ':'.
meta, msgtype, channel, message = data.split(' ', 3)
# Turn "key=value;key=value" into a dict; the first key keeps its '@'.
meta = {key: value for key, value in (tag.split('=') for tag in meta.split(';'))}
# Replace the raw emotes string with the parsed Emote tuples.
meta.update(emotes=parse_emotes(meta['emotes']))
我想将来自 Twitch 的 IRC 消息解析为字典列表,并考虑表情。
这是我可以从 Twitch 获得的示例:
"Testing. :) Confirmed!"
{"emotes": [(1, (9, 10))]}
说明第9到10个字符(字符串从零开始)有ID为1的表情。
我希望我的数据采用以下格式:
[
{
"type": "text",
"text": "Testing. "
},
{
"type": "emote",
"text": ":)",
"id": 1
},
{
"type": "text",
"text": " Confirmed!"
}
]
是否有相对简洁的方法来完成此操作?
我找到了一个解决方案,虽然很丑陋,但有效。
import re
# Matches a tagged Twitch PRIVMSG line.  Groups: 1 = raw tag string (optional,
# starts with '@'), 2 = sender login, 3 = channel name, 4 = message text.
# The user/host parts around '@' are matched with \w* so the real
# "nick!nick@nick.tmi.twitch.tv" prefix is accepted.
packet_expression = re.compile(
    r'(@.+)? :([a-zA-Z0-9][\w]{2,23})!\w*@\w*\.tmi\.twitch\.tv'
    r' PRIVMSG #([a-zA-Z0-9][\w]{2,23}) :(.+)'
)

# One "<id>:<start>-<end>[,<start>-<end>...]" entry of the ``emotes`` tag.
# Compiled once at import time instead of on every call.
_emote_expression = re.compile(r'(\d+):((?:\d+-\d+,)*\d+-\d+)')


def parse_twitch(packet):
    """Split the message of a Twitch PRIVMSG packet into text/emote parts.

    Args:
        packet: A raw IRC line of the form
            ``[@tags] :nick!user@host.tmi.twitch.tv PRIVMSG #channel :message``.

    Returns:
        A list of dictionaries in message order.  Plain text runs are
        ``{"type": "text", "text": ...}`` and emotes are
        ``{"type": "emote", "text": ..., "id": <int>}``.

    Raises:
        ValueError: If *packet* does not look like a PRIVMSG line.
    """
    match = packet_expression.match(packet)
    if match is None:
        raise ValueError("not a Twitch PRIVMSG packet: %r" % (packet,))

    # The tag group is optional; without it there are simply no emotes.
    tags = {}
    raw_tags = match.group(1)
    if raw_tags:
        for item in raw_tags[1:].split(';'):
            # partition() keeps any further '=' inside the value.
            key, _, value = item.partition('=')
            tags[key] = value

    # The tag groups ranges by emote id, not by position, so sort by start
    # index before walking the message from left to right.
    emotes = sorted(
        (int(start), int(end), int(emote_id))
        for emote_id, locations in _emote_expression.findall(tags.get("emotes", ''))
        for start, end in (location.split('-') for location in locations.split(','))
    )

    message = match.group(4)
    parts = []
    cursor = 0
    for start, end, emote_id in emotes:
        if start > cursor:
            parts.append({"type": "text", "text": message[cursor:start]})
        # Emote ranges are inclusive on both ends.
        parts.append({
            "type": "emote",
            "text": message[start:end + 1],
            "id": emote_id,
        })
        cursor = max(cursor, end + 1)
    if cursor < len(message):
        parts.append({"type": "text", "text": message[cursor:]})
    return parts
我不确定你收到的消息是不是这样的:
# Possibility 1: the message text and the emote metadata arrive together
# as a single two-line string.
message = '''\
"Testing. :) Confirmed!"
{"emotes": [(1, (9, 10))]}'''
或
# Possibility 2: the message text and the emote metadata arrive as two
# separate strings.
text = "Testing. :) Confirmed!"
meta = '{"emotes": [(1, (9, 10))]}'
我假设是后者,因为从前者转换到后者很容易。也可能是 python 表示。你说的不是很清楚
有一种好得多的方法可以解决这个问题:不使用正则表达式,只使用字符串解析:
import json
def split_by_emotes(text, emotes):
    """Split *text* into a list of text and emote segments.

    Args:
        text: The chat message.
        emotes: Iterable of ``(id, (start, end))`` entries, where ``start``
            and ``end`` are zero-based, inclusive character positions in
            *text*.  Entries may be given in any order.

    Returns:
        A list of ``{'type': 'text', 'text': ...}`` and
        ``{'type': 'emote', 'text': ..., 'id': ...}`` dictionaries in
        message order.  Empty text segments are never emitted.
    """
    segments = []
    cur_index = 0
    # Walk the emotes from left to right regardless of the input order.
    for emote_id, (start, end) in sorted(emotes, key=lambda emote: emote[1][0]):
        # Skip the text segment when the emote starts the message or
        # directly follows the previous emote.
        if start > cur_index:
            segments.append({'type': 'text', 'text': text[cur_index:start]})
        segments.append({'type': 'emote', 'text': text[start:end + 1],
                         'id': emote_id})
        cur_index = end + 1
    if text[cur_index:]:
        segments.append({'type': 'text', 'text': text[cur_index:]})
    return segments


text = 'Testing. :) Confirmed! :P'
print(len(text))
meta = '{"emotes": [(1, (9, 10)), (2, (23,25))]}'
# The metadata uses tuple syntax, which JSON lacks; turn the parentheses
# into brackets so it can be loaded as nested lists.
meta = json.loads(meta.replace('(', '[').replace(')', ']'))
results = split_by_emotes(text, meta['emotes'])
import pprint
pprint.pprint(results)
根据您的评论,数据采用的是自定义格式。我从评论中复制粘贴了示例数据,其中有几个字符我不确定是否真的出现在传入的数据中,希望我没有弄错。示例消息中也只有一种表情,所以我不完全确定多种不同的表情类型是如何表示的——我希望它们之间是用某种分隔符隔开,而不是出现多个 emote= 部分,否则这种方法需要做较大的修改。不过,下面的代码应该可以在不使用正则表达式的情况下完成解析。
from collections import namedtuple
# One occurrence of an emote: its numeric id and the inclusive, zero-based
# character range it occupies in the message.
Emote = namedtuple('Emote', ('id', 'start', 'end'))


def parse_emotes(raw):
    """Parse the value of an ``emotes`` tag into a list of ``Emote`` tuples.

    Args:
        raw: Tag value of the form ``"<id>:<start>-<end>,<start>-<end>/<id>:..."``.
            An empty string (a message without emotes) is accepted.

    Returns:
        A list of ``Emote`` tuples, one per occurrence, in the order they
        appear in *raw*.

    Raises:
        ValueError: If a non-empty entry is malformed.
    """
    emotes = []
    for raw_emote in raw.split('/'):
        # ''.split('/') yields [''], so an empty tag value lands here.
        if not raw_emote:
            continue
        emote_id, locations = raw_emote.split(':')
        emote_id = int(emote_id)
        for location in locations.split(','):
            start, end = location.split('-')
            emotes.append(Emote(id=emote_id, start=int(start), end=int(end)))
    return emotes
# Sample tagged PRIVMSG line used to exercise parse_emotes().
data = r'@badges=moderator/1;color=#0000FF;display-name=2Cubed;emotes=25:6-10,12-16;id=05aada01-f8c1-4b2e-a5be-2534096057b9;mod=1;room-id=82607708;subscriber=0;turbo=0;user-id=54561464;user-type=mod:2cubed!2cubed@2cubed.tmi.twitch.tv PRIVMSG #innectic :Hiya! Kappa Kappa'
# Cut on the first three spaces only, so the message keeps its own spaces.
# For this sample the pieces are: the tag string (with the sender prefix
# still attached to the last tag), 'PRIVMSG', '#innectic', and the message
# text with its leading ':'.
meta, msgtype, channel, message = data.split(' ', 3)
# Turn "key=value;key=value" into a dict; the first key keeps its '@'.
meta = {key: value for key, value in (tag.split('=') for tag in meta.split(';'))}
# Replace the raw emotes string with the parsed Emote tuples.
meta.update(emotes=parse_emotes(meta['emotes']))