如何从字符串中提取文本和表情符号?
How to extract text and emojis from a string?
我正在尝试把一个字符串中的文本和表情符号分开，但在实现过程中遇到了一些问题。下面是我尝试过的代码：
import emoji
# Sample tweet used as the default input. The quotation marks around
# "analysis" are HTML-escaped as &quot; (this matches the character dump
# shown in the output, which lists '&', 'q', 'u', 'o', 't', ';').
text = "#samplesenti @emojitweets i ❤❤❤ sentiment &quot; analysis &quot; http://senti.com/pic_01.jpg "
def extract_text_and_emoji(text=text):
    """Separate the plain words of *text* from the tokens that contain emoji.

    Whitespace-separated tokens starting with '@', 'http://', '&' or '#'
    are dropped from the text part. As a side effect the module-level names
    ``allchars`` (every character of *text*) and ``emoji_list`` (the
    characters found in the emoji table) are rebound.

    Returns a ``(clean_text, clean_emoji)`` tuple: the remaining emoji-free
    tokens joined by single spaces, and the emoji-bearing tokens of the
    original *text* concatenated without a separator.
    """
    global allchars, emoji_list

    # Tags, links and HTML entities are not needed for sentiment analysis.
    remove_keys = ('@', 'http://', '&', '#')
    kept_tokens = [token for token in text.split() if not token.startswith(remove_keys)]
    clean_text = ' '.join(kept_tokens)

    # Character-level view of the input, and the emoji found in it.
    allchars = list(text)
    # NOTE(review): the run reported for this snippet prints an empty emoji
    # list. The membership test is made against the top-level
    # emoji.UNICODE_EMOJI mapping, which is presumably keyed by language
    # code rather than by emoji character -- confirm against the installed
    # emoji version.
    emoji_list = [char for char in allchars if char in emoji.UNICODE_EMOJI]

    def contains_emoji(token):
        # True when any collected emoji character occurs inside the token.
        return any(symbol in token for symbol in emoji_list)

    # Text part: filtered tokens that carry no emoji.
    clean_text = ' '.join(token for token in clean_text.split() if not contains_emoji(token))
    # Emoji part: tokens of the unfiltered input that carry an emoji.
    clean_emoji = ''.join(token for token in text.split() if contains_emoji(token))
    return (clean_text, clean_emoji)
# Pre-create the module-level names that extract_text_and_emoji() rebinds
# through its `global` statement.
allchars = 0
emoji_list = 0

# Run the extraction on the default sample text.
clean_text, clean_emoji = extract_text_and_emoji()

# Show the character dump, the detected emoji, then the two results.
print('\nAll Char:', allchars)
print('\nAll Emoji:', emoji_list)
print('\n', clean_text)
print('\n', clean_emoji)
我希望在控制台上得到如下输出：
All Char: ['#', 's', 'a', 'm', 'p', 'l', 'e', 's', 'e', 'n', 't', 'i', ' ', '@', 'e', 'm', 'o', 'j', 'i', 't', 'w', 'e', 'e', 't', 's', ' ', 'i', ' ', '❤', '❤', '❤', ' ', 's', 'e', 'n', 't', 'i', 'm', 'e', 'n', 't', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'a', 'n', 'a', 'l', 'y', 's', 'i', 's', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'h', 't', 't', 'p', ':', '/', '/', 's', 'e', 'n', 't', 'i', '.', 'c', 'o', 'm', '/', 'p', 'i', 'c', '_', '0', '1', '.', 'j', 'p', 'g', ' ']
All Emoji: ['❤', '❤', '❤']
i sentiment analysis
❤❤❤
但我得到的是这个:
All Char: ['#', 's', 'a', 'm', 'p', 'l', 'e', 's', 'e', 'n', 't', 'i', ' ', '@', 'e', 'm', 'o', 'j', 'i', 't', 'w', 'e', 'e', 't', 's', ' ', 'i', ' ', '❤', '❤', '❤', ' ', 's', 'e', 'n', 't', 'i', 'm', 'e', 'n', 't', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'a', 'n', 'a', 'l', 'y', 's', 'i', 's', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'h', 't', 't', 'p', ':', '/', '/', 's', 'e', 'n', 't', 'i', '.', 'c', 'o', 'm', '/', 'p', 'i', 'c', '_', '0', '1', '.', 'j', 'p', 'g', ' ']
All Emoji: []
i ❤❤❤ sentiment analysis
可以试试下面的写法（用正则表达式逐个字符判断）：
import re
# Sample input. The quotation marks around "analysis" are HTML-escaped as
# &quot;, so '&' and ';' are part of the ordinary-character class below.
text = "#samplesenti @emojitweets i ❤❤❤ sentiment &quot; analysis &quot; http://senti.com/pic_01.jpg "

# Single characters considered ordinary tweet content: letters, digits and
# the punctuation used by tags, links and HTML entities.
ordinary_char = re.compile(r"^[# @:;./&A-Za-z0-9_-]*$")
# Single characters kept as plain text: letters, digits and the hyphen.
plain_text_char = re.compile(r"^[A-Za-z0-9-]*$")

# True at each position whose character falls outside the ordinary class,
# i.e. is treated as an emoji.
matching_emoji_indices = [ordinary_char.match(t) is None for t in text]
# True at each position whose character is NOT a letter, digit or hyphen.
matching_text_indices = [plain_text_char.match(t) is None for t in text]

# Keep the plain-text characters and the emoji characters separately.
text_list = [t for t, excluded in zip(text, matching_text_indices) if not excluded]
emoji_list = [t for t, is_emoji in zip(text, matching_emoji_indices) if is_emoji]
print(text_list)
print(emoji_list)
在 `emoji.UNICODE_EMOJI` 后面加上 `['en']`（即使用 `emoji.UNICODE_EMOJI['en']`）：
import emoji
# Sample tweet used as the default input. The quotation marks around
# "analysis" are HTML-escaped as &quot; (this matches the character dump
# shown in the output, which lists '&', 'q', 'u', 'o', 't', ';').
text = "#samplesenti @emojitweets i ❤❤❤ sentiment &quot; analysis &quot; http://senti.com/pic_01.jpg "
def extract_text_and_emoji(text=text):
    """Separate the plain words of *text* from the tokens that contain emoji.

    Whitespace-separated tokens starting with "@", "http://", "&" or "#"
    are dropped from the text part. As a side effect the module-level names
    ``allchars`` (every character of *text*) and ``emoji_list`` (the
    characters found in the "en" emoji table) are rebound.

    Returns a ``(clean_text, clean_emoji)`` tuple: the remaining emoji-free
    tokens joined by single spaces, and the emoji-bearing tokens of the
    original *text* concatenated without a separator.
    """
    global allchars, emoji_list

    # Tags, links and HTML entities are not needed for sentiment analysis.
    remove_keys = ("@", "http://", "&", "#")
    kept_tokens = [word for word in text.split() if not word.startswith(remove_keys)]
    clean_text = " ".join(kept_tokens)

    # Character-level view of the input, and the emoji found in it.
    allchars = list(text)
    # The fix: look each character up in the "en" table, not in the
    # top-level mapping.
    emoji_table = emoji.UNICODE_EMOJI["en"]
    emoji_list = [char for char in allchars if char in emoji_table]

    def contains_emoji(word):
        # True when any collected emoji character occurs inside the word.
        return any(symbol in word for symbol in emoji_list)

    # Text part: filtered tokens that carry no emoji.
    clean_text = " ".join(word for word in clean_text.split() if not contains_emoji(word))
    # Emoji part: tokens of the unfiltered input that carry an emoji.
    clean_emoji = "".join(word for word in text.split() if contains_emoji(word))
    return (clean_text, clean_emoji)
# Pre-create the module-level names that extract_text_and_emoji() rebinds
# through its `global` statement.
allchars = 0
emoji_list = 0

# Run the extraction on the default sample text.
clean_text, clean_emoji = extract_text_and_emoji()

# Show the character dump, the detected emoji, then the two results.
print("\nAll Char:", allchars)
print("\nAll Emoji:", emoji_list)
print("\n", clean_text)
print("\n", clean_emoji)
打印:
All Char: ['#', 's', 'a', 'm', 'p', 'l', 'e', 's', 'e', 'n', 't', 'i', ' ', '@', 'e', 'm', 'o', 'j', 'i', 't', 'w', 'e', 'e', 't', 's', ' ', 'i', ' ', '❤', '❤', '❤', ' ', 's', 'e', 'n', 't', 'i', 'm', 'e', 'n', 't', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'a', 'n', 'a', 'l', 'y', 's', 'i', 's', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'h', 't', 't', 'p', ':', '/', '/', 's', 'e', 'n', 't', 'i', '.', 'c', 'o', 'm', '/', 'p', 'i', 'c', '_', '0', '1', '.', 'j', 'p', 'g', ' ']
All Emoji: ['❤', '❤', '❤']
i sentiment analysis
❤❤❤
我正在尝试把一个字符串中的文本和表情符号分开，但在实现过程中遇到了一些问题。下面是我尝试过的代码：
import emoji
# Sample tweet used as the default input. The quotation marks around
# "analysis" are HTML-escaped as &quot; (this matches the character dump
# shown in the output, which lists '&', 'q', 'u', 'o', 't', ';').
text = "#samplesenti @emojitweets i ❤❤❤ sentiment &quot; analysis &quot; http://senti.com/pic_01.jpg "
def extract_text_and_emoji(text=text):
    """Separate the plain words of *text* from the tokens that contain emoji.

    Whitespace-separated tokens starting with '@', 'http://', '&' or '#'
    are dropped from the text part. As a side effect the module-level names
    ``allchars`` (every character of *text*) and ``emoji_list`` (the
    characters found in the emoji table) are rebound.

    Returns a ``(clean_text, clean_emoji)`` tuple: the remaining emoji-free
    tokens joined by single spaces, and the emoji-bearing tokens of the
    original *text* concatenated without a separator.
    """
    global allchars, emoji_list

    # Tags, links and HTML entities are not needed for sentiment analysis.
    remove_keys = ('@', 'http://', '&', '#')
    kept_tokens = [token for token in text.split() if not token.startswith(remove_keys)]
    clean_text = ' '.join(kept_tokens)

    # Character-level view of the input, and the emoji found in it.
    allchars = list(text)
    # NOTE(review): the run reported for this snippet prints an empty emoji
    # list. The membership test is made against the top-level
    # emoji.UNICODE_EMOJI mapping, which is presumably keyed by language
    # code rather than by emoji character -- confirm against the installed
    # emoji version.
    emoji_list = [char for char in allchars if char in emoji.UNICODE_EMOJI]

    def contains_emoji(token):
        # True when any collected emoji character occurs inside the token.
        return any(symbol in token for symbol in emoji_list)

    # Text part: filtered tokens that carry no emoji.
    clean_text = ' '.join(token for token in clean_text.split() if not contains_emoji(token))
    # Emoji part: tokens of the unfiltered input that carry an emoji.
    clean_emoji = ''.join(token for token in text.split() if contains_emoji(token))
    return (clean_text, clean_emoji)
# Pre-create the module-level names that extract_text_and_emoji() rebinds
# through its `global` statement.
allchars = 0
emoji_list = 0

# Run the extraction on the default sample text.
clean_text, clean_emoji = extract_text_and_emoji()

# Show the character dump, the detected emoji, then the two results.
print('\nAll Char:', allchars)
print('\nAll Emoji:', emoji_list)
print('\n', clean_text)
print('\n', clean_emoji)
我希望在控制台上得到如下输出：
All Char: ['#', 's', 'a', 'm', 'p', 'l', 'e', 's', 'e', 'n', 't', 'i', ' ', '@', 'e', 'm', 'o', 'j', 'i', 't', 'w', 'e', 'e', 't', 's', ' ', 'i', ' ', '❤', '❤', '❤', ' ', 's', 'e', 'n', 't', 'i', 'm', 'e', 'n', 't', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'a', 'n', 'a', 'l', 'y', 's', 'i', 's', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'h', 't', 't', 'p', ':', '/', '/', 's', 'e', 'n', 't', 'i', '.', 'c', 'o', 'm', '/', 'p', 'i', 'c', '_', '0', '1', '.', 'j', 'p', 'g', ' ']
All Emoji: ['❤', '❤', '❤']
i sentiment analysis
❤❤❤
但我得到的是这个:
All Char: ['#', 's', 'a', 'm', 'p', 'l', 'e', 's', 'e', 'n', 't', 'i', ' ', '@', 'e', 'm', 'o', 'j', 'i', 't', 'w', 'e', 'e', 't', 's', ' ', 'i', ' ', '❤', '❤', '❤', ' ', 's', 'e', 'n', 't', 'i', 'm', 'e', 'n', 't', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'a', 'n', 'a', 'l', 'y', 's', 'i', 's', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'h', 't', 't', 'p', ':', '/', '/', 's', 'e', 'n', 't', 'i', '.', 'c', 'o', 'm', '/', 'p', 'i', 'c', '_', '0', '1', '.', 'j', 'p', 'g', ' ']
All Emoji: []
i ❤❤❤ sentiment analysis
可以试试下面的写法（用正则表达式逐个字符判断）：
import re
# Sample input. The quotation marks around "analysis" are HTML-escaped as
# &quot;, so '&' and ';' are part of the ordinary-character class below.
text = "#samplesenti @emojitweets i ❤❤❤ sentiment &quot; analysis &quot; http://senti.com/pic_01.jpg "

# Single characters considered ordinary tweet content: letters, digits and
# the punctuation used by tags, links and HTML entities.
ordinary_char = re.compile(r"^[# @:;./&A-Za-z0-9_-]*$")
# Single characters kept as plain text: letters, digits and the hyphen.
plain_text_char = re.compile(r"^[A-Za-z0-9-]*$")

# True at each position whose character falls outside the ordinary class,
# i.e. is treated as an emoji.
matching_emoji_indices = [ordinary_char.match(t) is None for t in text]
# True at each position whose character is NOT a letter, digit or hyphen.
matching_text_indices = [plain_text_char.match(t) is None for t in text]

# Keep the plain-text characters and the emoji characters separately.
text_list = [t for t, excluded in zip(text, matching_text_indices) if not excluded]
emoji_list = [t for t, is_emoji in zip(text, matching_emoji_indices) if is_emoji]
print(text_list)
print(emoji_list)
在 `emoji.UNICODE_EMOJI` 后面加上 `['en']`（即使用 `emoji.UNICODE_EMOJI['en']`）：
import emoji
# Sample tweet used as the default input. The quotation marks around
# "analysis" are HTML-escaped as &quot; (this matches the character dump
# shown in the output, which lists '&', 'q', 'u', 'o', 't', ';').
text = "#samplesenti @emojitweets i ❤❤❤ sentiment &quot; analysis &quot; http://senti.com/pic_01.jpg "
def extract_text_and_emoji(text=text):
    """Separate the plain words of *text* from the tokens that contain emoji.

    Whitespace-separated tokens starting with "@", "http://", "&" or "#"
    are dropped from the text part. As a side effect the module-level names
    ``allchars`` (every character of *text*) and ``emoji_list`` (the
    characters found in the "en" emoji table) are rebound.

    Returns a ``(clean_text, clean_emoji)`` tuple: the remaining emoji-free
    tokens joined by single spaces, and the emoji-bearing tokens of the
    original *text* concatenated without a separator.
    """
    global allchars, emoji_list

    # Tags, links and HTML entities are not needed for sentiment analysis.
    remove_keys = ("@", "http://", "&", "#")
    kept_tokens = [word for word in text.split() if not word.startswith(remove_keys)]
    clean_text = " ".join(kept_tokens)

    # Character-level view of the input, and the emoji found in it.
    allchars = list(text)
    # The fix: look each character up in the "en" table, not in the
    # top-level mapping.
    emoji_table = emoji.UNICODE_EMOJI["en"]
    emoji_list = [char for char in allchars if char in emoji_table]

    def contains_emoji(word):
        # True when any collected emoji character occurs inside the word.
        return any(symbol in word for symbol in emoji_list)

    # Text part: filtered tokens that carry no emoji.
    clean_text = " ".join(word for word in clean_text.split() if not contains_emoji(word))
    # Emoji part: tokens of the unfiltered input that carry an emoji.
    clean_emoji = "".join(word for word in text.split() if contains_emoji(word))
    return (clean_text, clean_emoji)
# Pre-create the module-level names that extract_text_and_emoji() rebinds
# through its `global` statement.
allchars = 0
emoji_list = 0

# Run the extraction on the default sample text.
clean_text, clean_emoji = extract_text_and_emoji()

# Show the character dump, the detected emoji, then the two results.
print("\nAll Char:", allchars)
print("\nAll Emoji:", emoji_list)
print("\n", clean_text)
print("\n", clean_emoji)
打印:
All Char: ['#', 's', 'a', 'm', 'p', 'l', 'e', 's', 'e', 'n', 't', 'i', ' ', '@', 'e', 'm', 'o', 'j', 'i', 't', 'w', 'e', 'e', 't', 's', ' ', 'i', ' ', '❤', '❤', '❤', ' ', 's', 'e', 'n', 't', 'i', 'm', 'e', 'n', 't', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'a', 'n', 'a', 'l', 'y', 's', 'i', 's', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'h', 't', 't', 'p', ':', '/', '/', 's', 'e', 'n', 't', 'i', '.', 'c', 'o', 'm', '/', 'p', 'i', 'c', '_', '0', '1', '.', 'j', 'p', 'g', ' ']
All Emoji: ['❤', '❤', '❤']
i sentiment analysis
❤❤❤