我想在 Python 2.7 中,从一个长字符串(段落)里提取给定单词周围一定数量的单词
I want to extract a certain number of words surrounding a given word in a long string(paragraph) in Python 2.7
我正在尝试提取围绕给定单词的选定数量的单词。我将举例说明:
字符串="Education shall be directed to the full development of the human personality and to the strengthening of respect for human rights and fundamental freedoms."
1) 选择的词是 development 我需要得到它周围的 6 个词,并得到:[to, the, full, of, the, human]
2) 但是如果选择的单词在开头或者第二个位置我仍然需要得到 6 个单词,例如:
选择的词是 shall ,我应该得到:[Education, be, directed, to , the , full]
我应该使用 're' 模块。到目前为止我设法找到的是:
def search(text, n, word='place'):
    """Find ``word`` in ``text`` and return the ``n`` words on either side.

    Args:
        text: The string to search in.
        n: Number of words to capture before and after ``word``.
        word: The literal word to look for. Defaults to ``'place'``, the
            value that used to be hard-coded in the pattern.

    Returns:
        A pair ``(before, after)`` of tuples with ``n`` words each, taken
        from the first position where the pattern matches. ``((), ())`` is
        returned when the pattern does not match, which includes the case
        where fewer than ``n`` words exist on one side of ``word``.
    """
    token = r"\W*([\w]+)"
    # re.escape keeps characters in ``word`` from being read as regex syntax.
    pattern = r'{}\W*{}{}'.format(token * n, re.escape(word), token * n)
    match = re.search(pattern, text)
    if match is None:
        # Without this guard ``.groups()`` would raise AttributeError.
        return (), ()
    groups = match.groups()
    return groups[:n], groups[n:]
但它只对第一种情况有帮助。有人可以帮我解决这个问题,我将非常感激。提前致谢!
有点棘手,可能会出现差一错误,但我认为这符合您的规格。我已经删除了标点符号,最好在发送字符串进行分析之前将其删除。我假设大小写不重要。
test_str = "Education shall be directed to the full development of the human personality and to the strengthening of respect for human rights and fundamental freedoms."
def get_surrounding_words(search_word, s, n_words):
    """Return up to ``n_words`` words surrounding ``search_word`` in ``s``.

    The text is lower-cased and split on single spaces; the search word is
    lower-cased as well so the lookup is case-insensitive. Only the first
    occurrence is used, and the search word itself is not part of the
    result. Punctuation is not stripped, so it stays attached to words.

    Args:
        search_word: The word to look for.
        s: The text to search in.
        n_words: Total number of surrounding words to return.

    Returns:
        A list of lower-cased words. Normally ``n_words // 2`` of them come
        from before the search word and the rest from after it; near either
        end of the text the window slides so that ``n_words`` words are
        still returned when the text is long enough. An empty list is
        returned when the search word is not found.
    """
    words = s.lower().split(' ')
    try:
        i = words.index(search_word.lower())
    except ValueError:
        return []
    # Drop the target so the window only ever contains context words.
    del words[i]
    # Floor division keeps the index an int on both Python 2 and Python 3.
    start = i - n_words // 2
    # Slide the window back inside the list when the target is near an end.
    start = max(0, min(start, len(words) - n_words))
    return words[start:start + n_words]
def test(word):
    """Print ``word`` followed by the 6 words surrounding it in ``test_str``."""
    surrounding = get_surrounding_words(word, test_str, 6)
    print('{}: {}'.format(word, surrounding))
test('notfound')
test('development')
test('shall')
test('education')
test('fundamental')
test('for')
test('freedoms')
import sys, os
def _context_words(words, query, left, right):
    """Return up to ``left + right`` words around the first ``query`` in ``words``.

    The window normally holds ``left`` words before the match and ``right``
    words after it; near either end of the list it is shifted so that it
    still holds ``left + right`` words when the list is long enough. The
    matched word itself is excluded. An empty list is returned when
    ``query`` is not in ``words``.
    """
    try:
        index = words.index(query)
    except ValueError:
        return []
    window = left + right
    # Ideal start is ``index - left``; pull it back if the window would run
    # past the end of the list, and never let it go below zero.
    start = max(0, min(index - left, len(words) - window - 1))
    return words[start:index] + words[index + 1:start + window + 1]


def main(args):
    """Print and return the 6 words surrounding a query word.

    Args:
        args: Command-line arguments, expected to be ``[text, query]``.

    Returns:
        The list of surrounding words that was printed.

    Raises:
        SystemExit: If ``args`` does not contain exactly two items.
    """
    if len(args) != 2:
        # ``os`` has no ``exit`` function; ``sys.exit`` prints the message
        # to stderr and terminates with a non-zero status.
        sys.exit("Use with <string> <query>")
    text, query = args
    op = _context_words(text.split(), query, 3, 3)
    # Parenthesised form prints the same on Python 2 and Python 3.
    print(op)
    return op


if __name__ == "__main__":
    main(sys.argv[1:])
- 这是如何工作的?
- 找到字符串中的单词
- 看看我们是否可以根据索引生成左+右词
- 取左+右字数存入op
- 打印 op
这将提取文本中所有出现的目标词,上下文为:
import re
text = ("Education shall be directed to the full development of the human personality "
"and to the strengthening of respect for human rights and fundamental freedoms.")
def search(target, text, context=6):
    """Yield each occurrence of ``target`` in ``text`` with its context.

    The text is split into words with ``re.findall(r'\w+', ...)``, which
    also discards punctuation. Matching is case-insensitive.

    Args:
        target: The word to look for.
        text: The text to search in.
        context: Total number of surrounding words wanted per match.

    Yields:
        For every occurrence, a list of up to ``context + 1`` consecutive
        words that includes the target itself. Normally ``context // 2``
        words precede the target; near either end of the text the window is
        shifted so it still holds ``context + 1`` words when possible.
    """
    words = re.findall(r'\w+', text)
    # Lower-case the target too, otherwise a capitalised target never matches.
    wanted = target.lower()
    window = context + 1
    half = context // 2
    # Largest start index that still leaves a full window before the end.
    last_start = max(0, len(words) - window)
    for index, candidate in enumerate(words):
        if candidate.lower() != wanted:
            continue
        start = max(0, min(index - half, last_start))
        yield words[start:start + window]
print(list(search('the', text)))
# [['be', 'directed', 'to', 'the', 'full', 'development', 'of'],
# ['full', 'development', 'of', 'the', 'human', 'personality', 'and'],
# ['personality', 'and', 'to', 'the', 'strengthening', 'of', 'respect']]
print(list(search('shall', text)))
# [['Education', 'shall', 'be', 'directed', 'to', 'the', 'full']]
print(list(search('freedoms', text)))
# [['respect', 'for', 'human', 'rights', 'and', 'fundamental', 'freedoms']]
解决您的问题的简单方法。先把所有的词分开,然后左右选词。
def custom_search(sentence, word, n):
    """Return up to ``n`` words surrounding the first ``word`` in ``sentence``.

    The sentence is stripped and split on single spaces, so punctuation
    stays attached to its word and matching is case-sensitive.

    Args:
        sentence: The text to search in.
        word: The exact word to look for.
        n: Total number of surrounding words to return.

    Returns:
        A list of words, excluding ``word`` itself. Normally ``n // 2`` of
        them precede ``word`` and the rest follow it; near either end of
        the sentence the window is shifted so that ``n`` words are still
        returned when the sentence is long enough.

    Raises:
        ValueError: If ``word`` does not occur in ``sentence``.
    """
    word_list = sentence.strip().split(" ")
    position = word_list.index(word)
    # Floor division keeps the index an int on both Python 2 and Python 3.
    # The window spans n + 1 slots (context plus the target), so its start
    # is capped at len - n - 1 and floored at 0 for short sentences.
    start = max(0, min(position - n // 2, len(word_list) - n - 1))
    return word_list[start:position] + word_list[position + 1:start + n + 1]
sentence = "Education shall be directed to the full development of the human personality and to the strengthening of respect for human rights and fundamental freedoms."
custom_search(sentence, "shall", 6)
>>[Education, be, directed, to , the , full]
custom_search(sentence, "development", 6)
>>['to', 'the', 'full', 'of', 'the', 'human']
我认为这里不需要正则表达式。假设文本结构良好,只需将其拆分为一个单词数组,并编写几个 if-else 语句以确保它检索到必要数量的周围单词:
def search(text, word, n):
# text is the string you are searching
# word is the word you are looking for
# n is the TOTAL number of words you want surrounding the word
words = text.split(" ") # Create an array of words from the string
position = words.index(word) # Find the position of the desired word
distance_from_end = len(words) - position # How many words are after the word in the text
if position < n // 2 + n % 2: # If there aren't enough words before...
return words[:position], words[position + 1:n + 1]
elif distance_from_end < n // 2 + n % 2: # If there aren't enough words after...
return words[position - n + distance_from_end:position], words[position + 1:]
else: # Otherwise, extract an equal number of words from both sides (take from the right if odd)
return words[position - n // 2 - n % 2:position], words[position + 1:position + 1 + n//2]
string = "Education shall be directed to the full development of the human personality and to the \
strengthening of respect for human rights and fundamental freedoms."
print search(string, "shall", 6)
# >> (['Education'], ['be', 'directed', 'to', 'the', 'full'])
print search(string, "human", 5)
# >> (['development', 'of', 'the'], ['personality', 'and'])
在您的示例中,您没有在输出中包含目标词,因此我也没有把它包含进去。如果您想要包含目标词,只需将函数返回的两个数组合并起来(在 position 处把它们连接起来)。
希望对您有所帮助!
我正在尝试提取围绕给定单词的选定数量的单词。我将举例说明:
字符串="Education shall be directed to the full development of the human personality and to the strengthening of respect for human rights and fundamental freedoms."
1) 选择的词是 development 我需要得到它周围的 6 个词,并得到:[to, the, full, of, the, human]
2) 但是如果选择的单词在开头或者第二个位置我仍然需要得到 6 个单词,例如:
选择的词是 shall ,我应该得到:[Education, be, directed, to , the , full]
我应该使用 're' 模块。到目前为止我设法找到的是:
def search(text, n, word='place'):
    """Find ``word`` in ``text`` and return the ``n`` words on either side.

    Args:
        text: The string to search in.
        n: Number of words to capture before and after ``word``.
        word: The literal word to look for. Defaults to ``'place'``, the
            value that used to be hard-coded in the pattern.

    Returns:
        A pair ``(before, after)`` of tuples with ``n`` words each, taken
        from the first position where the pattern matches. ``((), ())`` is
        returned when the pattern does not match, which includes the case
        where fewer than ``n`` words exist on one side of ``word``.
    """
    token = r"\W*([\w]+)"
    # re.escape keeps characters in ``word`` from being read as regex syntax.
    pattern = r'{}\W*{}{}'.format(token * n, re.escape(word), token * n)
    match = re.search(pattern, text)
    if match is None:
        # Without this guard ``.groups()`` would raise AttributeError.
        return (), ()
    groups = match.groups()
    return groups[:n], groups[n:]
但它只对第一种情况有帮助。有人可以帮我解决这个问题,我将非常感激。提前致谢!
有点棘手,可能会出现差一错误,但我认为这符合您的规格。我已经删除了标点符号,最好在发送字符串进行分析之前将其删除。我假设大小写不重要。
test_str = "Education shall be directed to the full development of the human personality and to the strengthening of respect for human rights and fundamental freedoms."
def get_surrounding_words(search_word, s, n_words):
    """Return up to ``n_words`` words surrounding ``search_word`` in ``s``.

    The text is lower-cased and split on single spaces; the search word is
    lower-cased as well so the lookup is case-insensitive. Only the first
    occurrence is used, and the search word itself is not part of the
    result. Punctuation is not stripped, so it stays attached to words.

    Args:
        search_word: The word to look for.
        s: The text to search in.
        n_words: Total number of surrounding words to return.

    Returns:
        A list of lower-cased words. Normally ``n_words // 2`` of them come
        from before the search word and the rest from after it; near either
        end of the text the window slides so that ``n_words`` words are
        still returned when the text is long enough. An empty list is
        returned when the search word is not found.
    """
    words = s.lower().split(' ')
    try:
        i = words.index(search_word.lower())
    except ValueError:
        return []
    # Drop the target so the window only ever contains context words.
    del words[i]
    # Floor division keeps the index an int on both Python 2 and Python 3.
    start = i - n_words // 2
    # Slide the window back inside the list when the target is near an end.
    start = max(0, min(start, len(words) - n_words))
    return words[start:start + n_words]
def test(word):
    """Print ``word`` followed by the 6 words surrounding it in ``test_str``."""
    surrounding = get_surrounding_words(word, test_str, 6)
    print('{}: {}'.format(word, surrounding))
test('notfound')
test('development')
test('shall')
test('education')
test('fundamental')
test('for')
test('freedoms')
import sys, os
def _context_words(words, query, left, right):
    """Return up to ``left + right`` words around the first ``query`` in ``words``.

    The window normally holds ``left`` words before the match and ``right``
    words after it; near either end of the list it is shifted so that it
    still holds ``left + right`` words when the list is long enough. The
    matched word itself is excluded. An empty list is returned when
    ``query`` is not in ``words``.
    """
    try:
        index = words.index(query)
    except ValueError:
        return []
    window = left + right
    # Ideal start is ``index - left``; pull it back if the window would run
    # past the end of the list, and never let it go below zero.
    start = max(0, min(index - left, len(words) - window - 1))
    return words[start:index] + words[index + 1:start + window + 1]


def main(args):
    """Print and return the 6 words surrounding a query word.

    Args:
        args: Command-line arguments, expected to be ``[text, query]``.

    Returns:
        The list of surrounding words that was printed.

    Raises:
        SystemExit: If ``args`` does not contain exactly two items.
    """
    if len(args) != 2:
        # ``os`` has no ``exit`` function; ``sys.exit`` prints the message
        # to stderr and terminates with a non-zero status.
        sys.exit("Use with <string> <query>")
    text, query = args
    op = _context_words(text.split(), query, 3, 3)
    # Parenthesised form prints the same on Python 2 and Python 3.
    print(op)
    return op


if __name__ == "__main__":
    main(sys.argv[1:])
- 这是如何工作的?
- 找到字符串中的单词
- 看看我们是否可以根据索引生成左+右词
- 取左+右字数存入op
- 打印 op
这将提取文本中所有出现的目标词,上下文为:
import re
text = ("Education shall be directed to the full development of the human personality "
"and to the strengthening of respect for human rights and fundamental freedoms.")
def search(target, text, context=6):
    """Yield each occurrence of ``target`` in ``text`` with its context.

    The text is split into words with ``re.findall(r'\w+', ...)``, which
    also discards punctuation. Matching is case-insensitive.

    Args:
        target: The word to look for.
        text: The text to search in.
        context: Total number of surrounding words wanted per match.

    Yields:
        For every occurrence, a list of up to ``context + 1`` consecutive
        words that includes the target itself. Normally ``context // 2``
        words precede the target; near either end of the text the window is
        shifted so it still holds ``context + 1`` words when possible.
    """
    words = re.findall(r'\w+', text)
    # Lower-case the target too, otherwise a capitalised target never matches.
    wanted = target.lower()
    window = context + 1
    half = context // 2
    # Largest start index that still leaves a full window before the end.
    last_start = max(0, len(words) - window)
    for index, candidate in enumerate(words):
        if candidate.lower() != wanted:
            continue
        start = max(0, min(index - half, last_start))
        yield words[start:start + window]
print(list(search('the', text)))
# [['be', 'directed', 'to', 'the', 'full', 'development', 'of'],
# ['full', 'development', 'of', 'the', 'human', 'personality', 'and'],
# ['personality', 'and', 'to', 'the', 'strengthening', 'of', 'respect']]
print(list(search('shall', text)))
# [['Education', 'shall', 'be', 'directed', 'to', 'the', 'full']]
print(list(search('freedoms', text)))
# [['respect', 'for', 'human', 'rights', 'and', 'fundamental', 'freedoms']]
解决您的问题的简单方法。先把所有的词分开,然后左右选词。
def custom_search(sentence, word, n):
    """Return up to ``n`` words surrounding the first ``word`` in ``sentence``.

    The sentence is stripped and split on single spaces, so punctuation
    stays attached to its word and matching is case-sensitive.

    Args:
        sentence: The text to search in.
        word: The exact word to look for.
        n: Total number of surrounding words to return.

    Returns:
        A list of words, excluding ``word`` itself. Normally ``n // 2`` of
        them precede ``word`` and the rest follow it; near either end of
        the sentence the window is shifted so that ``n`` words are still
        returned when the sentence is long enough.

    Raises:
        ValueError: If ``word`` does not occur in ``sentence``.
    """
    word_list = sentence.strip().split(" ")
    position = word_list.index(word)
    # Floor division keeps the index an int on both Python 2 and Python 3.
    # The window spans n + 1 slots (context plus the target), so its start
    # is capped at len - n - 1 and floored at 0 for short sentences.
    start = max(0, min(position - n // 2, len(word_list) - n - 1))
    return word_list[start:position] + word_list[position + 1:start + n + 1]
sentence = "Education shall be directed to the full development of the human personality and to the strengthening of respect for human rights and fundamental freedoms."
custom_search(sentence, "shall", 6)
>>[Education, be, directed, to , the , full]
custom_search(sentence, "development", 6)
>>['to', 'the', 'full', 'of', 'the', 'human']
我认为这里不需要正则表达式。假设文本结构良好,只需将其拆分为一个单词数组,并编写几个 if-else 语句以确保它检索到必要数量的周围单词:
def search(text, word, n):
# text is the string you are searching
# word is the word you are looking for
# n is the TOTAL number of words you want surrounding the word
words = text.split(" ") # Create an array of words from the string
position = words.index(word) # Find the position of the desired word
distance_from_end = len(words) - position # How many words are after the word in the text
if position < n // 2 + n % 2: # If there aren't enough words before...
return words[:position], words[position + 1:n + 1]
elif distance_from_end < n // 2 + n % 2: # If there aren't enough words after...
return words[position - n + distance_from_end:position], words[position + 1:]
else: # Otherwise, extract an equal number of words from both sides (take from the right if odd)
return words[position - n // 2 - n % 2:position], words[position + 1:position + 1 + n//2]
string = "Education shall be directed to the full development of the human personality and to the \
strengthening of respect for human rights and fundamental freedoms."
print search(string, "shall", 6)
# >> (['Education'], ['be', 'directed', 'to', 'the', 'full'])
print search(string, "human", 5)
# >> (['development', 'of', 'the'], ['personality', 'and'])
在您的示例中,您没有在输出中包含目标词,因此我也没有把它包含进去。如果您想要包含目标词,只需将函数返回的两个数组合并起来(在 position 处把它们连接起来)。
希望对您有所帮助!