获得最少数量的子词
Getting the least amount of sub words
解决方案适应return最大最小字:
import java.util.*;
public class SubWordsFinder
{
private Set<String> words;
public SubWordsFinder(Set<String> words)
{
this.words = words;
}
public List<String> findSubWords(String word) throws NoSolutionFoundException
{
List<String> bestSolution = new ArrayList<>();
if (word.isEmpty())
{
return bestSolution;
}
long length = word.length();
int[] pointer = new int[]{0, 0};
LinkedList<int[]> pointerStack = new LinkedList<>();
LinkedList<String> currentSolution = new LinkedList<>();
while (true)
{
boolean backtrack = false;
for (int end = pointer[1] + 1; end <= length; end++)
{
if (end == length)
{
backtrack = true;
}
pointer[1] = end;
String wordToFind = word.substring(pointer[0], end);
if (words.contains(wordToFind))
{
currentSolution.add(wordToFind);
if (backtrack)
{
if (bestSolution.isEmpty() || (currentSolution.size() <= bestSolution.size() && getSmallestSubWordLength(currentSolution) > getSmallestSubWordLength(bestSolution)))
{
bestSolution = new ArrayList<>(currentSolution);
}
currentSolution.removeLast();
} else if (!bestSolution.isEmpty() && currentSolution.size() == bestSolution.size())
{
currentSolution.removeLast();
backtrack = true;
} else
{
int[] nextPointer = new int[]{end, end};
pointerStack.add(pointer);
pointer = nextPointer;
}
break;
}
}
if (backtrack)
{
if (pointerStack.isEmpty())
{
break;
} else
{
currentSolution.removeLast();
pointer = pointerStack.removeLast();
}
}
}
if (bestSolution.isEmpty())
{
throw new NoSolutionFoundException();
} else
{
return bestSolution;
}
}
private int getSmallestSubWordLength(List<String> words)
{
int length = Integer.MAX_VALUE;
for (String word : words)
{
if (word.length() < length)
{
length = word.length();
}
}
return length;
}
public class NoSolutionFoundException extends Exception
{
private static final long serialVersionUID = 1L;
}
}
我有一个 String
包含小写的常规英语单词。假设这个 String
已经分解为所有可能子词的 List
:
public List<String> getSubWords(String text)
{
List<String> words = new ArrayList<>();
for (int startingIndex = 0; startingIndex < text.length() + 1; startingIndex++)
{
for (int endIndex = startingIndex + 1; endIndex < text.length() + 1; endIndex++)
{
String subString = text.substring(startingIndex, endIndex);
if (contains(subString))
{
words.add(subString);
}
}
}
Comparator<String> lengthComparator = (firstItem, secondItem) ->
{
if (firstItem.length() > secondItem.length())
{
return -1;
}
if (secondItem.length() > firstItem.length())
{
return 1;
}
return 0;
};
// Sort the list in descending String length order
Collections.sort(words, lengthComparator);
return words;
}
如何找到构成原始字符串的最少数量的子词?
例如:
String text = "updatescrollbar";
List<String> leastWords = getLeastSubWords(text);
System.out.println(leastWords);
输出:
[update, scroll, bar]
我不确定如何遍历所有可能性,因为它们会根据所选单词而变化。开始应该是这样的:
public List<String> getLeastSubWords(String text)
{
String textBackup = text;
List<String> subWords = getSubWords(text);
System.out.println(subWords);
List<List<String>> containing = new ArrayList<>();
List<String> validWords = new ArrayList<>();
for (String subWord : subWords)
{
if (text.startsWith(subWord))
{
validWords.add(subWord);
text = text.substring(subWord.length());
}
}
// Did we find a valid words distribution?
if (text.length() == 0)
{
System.out.println(validWords.size());
}
return null;
}
注:
这与 this 问题有关。
这要求很多场景的可能性。
你的例子 (updatescrollbar
) 已经 up date ate update scroll bar
并且仍然很简单,但是如果你有一个词,它有 as 子词,在字符串。
因此,要完成此操作,您必须多次迭代子词列表,跟踪当前最短的有效版本匹配您的文本,并继续迭代,直到尝试所有变体。
您可以减少变体的数量,例如通过使用一种算法,该算法将剩余的要匹配的 space 考虑在内:
- 按长度对子词进行排序,并尝试首先匹配最长词的文本:可能的子词长度=文本-/- 匹配的文本。
这将使用包含,因此要匹配的文本仍然可以在已经匹配的单词之前和之后:为您的文本使用数组,以便更容易跟踪匹配的文本。
UPDATE:如果反转内部 foreach,下面的算法会更有效。在这种情况下,将首先检查较长的单词。
这是回溯算法的典型情况。
将你的单词存储在 TreeSet
中,并实现此算法:
将开始和结束指针设置为 0
。创建一个堆栈来存储以前版本的指针。
从起始指针生成子串,同时增加结束指针,查找已知单词。如果找到一个单词,将其存储在一个数组中,并将单词的长度添加到起始指针,将此指针压入堆栈。如果没有找到已知单词或到达最后一个字符,则将开始和结束指针设置为从堆栈中弹出的前一个值(回溯)。重复第 2 步。
要找到最少的子词,您必须存储以前的最佳解决方案,并将其字数与当前解决方案的字数进行比较。
下面是一个示例实现。它包含一些优化实验:没有递归,在错误分支上回溯等。您可以添加更多优化(例如,跟踪使用的起始位置,或查找可能的子词起始位置),但可能没有必要,如果参数是一个不太长的词。
public class SubWordFinder {
private TreeSet<String> words = new TreeSet<String>();
public SubWordFinder(Collection<String> words) {
this.words.addAll(words);
}
public List<String> findSubWords(String word) throws NoSolutionFoundException {
List<String> bestSolution = new ArrayList<String>();
if (word.isEmpty()) {
return bestSolution;
}
long length = word.length();
int[] pointer = new int[]{0, 0};
LinkedList<int[]> pointerStack = new LinkedList<int[]>();
LinkedList<String> currentSolution = new LinkedList<String>();
while (true) {
boolean backtrack = false;
for (int end = pointer[1] + 1; end <= length; end++) {
if (end == length) {
backtrack = true;
}
pointer[1] = end;
String wordToFind = word.substring(pointer[0], end);
if (words.contains(wordToFind)) {
currentSolution.add(wordToFind);
if (backtrack) {
if (bestSolution.isEmpty() || currentSolution.size() < bestSolution.size()) {
bestSolution = new ArrayList<String>(currentSolution);
}
currentSolution.removeLast();
} else if (!bestSolution.isEmpty() && currentSolution.size() == bestSolution.size()) {
currentSolution.removeLast();
backtrack = true;
} else {
int nextStart = end;
int[] nextPointer = new int[]{nextStart, nextStart};
pointerStack.add(pointer);
pointer = nextPointer;
}
break;
}
}
if (backtrack) {
if (pointerStack.isEmpty()) {
break;
} else {
currentSolution.removeLast();
pointer = pointerStack.removeLast();
}
}
}
if (bestSolution.isEmpty()) {
throw new NoSolutionFoundException();
} else {
return bestSolution;
}
}
public class NoSolutionFoundException extends Exception {
private static final long serialVersionUID = 1L;
}
}
测试:
public class SubWordFinderTest {
@Test
public void generalTest() throws SubWordFinder.NoSolutionFoundException {
List<String> words = new ArrayList<String>();
words.add("ab");
words.add("abc");
words.add("cd");
words.add("cde");
words.add("de");
words.add("e");
SubWordFinder finder = new SubWordFinder(words);
assertEquals(new ArrayList<String>(), finder.findSubWords(""));
assertEquals(Arrays.asList("ab", "cde"), finder.findSubWords("abcde"));
assertEquals(Arrays.asList("cd", "cde"), finder.findSubWords("cdcde"));
assertEquals(Arrays.asList("abc", "cd"), finder.findSubWords("abccd"));
assertEquals(Arrays.asList("de", "e", "e", "e", "e"), finder.findSubWords("deeeee"));
try {
finder.findSubWords("ae");
fail();
} catch (SubWordFinder.NoSolutionFoundException e) {
}
try {
finder.findSubWords("abcccd");
fail();
} catch (SubWordFinder.NoSolutionFoundException e) {
}
try {
finder.findSubWords("abcdex");
fail();
} catch (SubWordFinder.NoSolutionFoundException e) {
}
}
@Test
public void dictionaryTest() throws IOException, SubWordFinder.NoSolutionFoundException {
String resourceDir = "/path_to_resources";
InputStream inputStream = getClass().getResource(resourceDir + "/20k.txt").openStream();
InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
List<String> words = new ArrayList<String>();
String line;
while ((line = bufferedReader.readLine()) != null) {
words.add(line);
}
SubWordFinder finder = new SubWordFinder(words);
assertEquals(Arrays.asList("bromide", "freshet"), finder.findSubWords("bromidefreshet"));
}
}
解决方案
import java.util.*;
public class SubWordsFinder
{
private Set<String> words;
public SubWordsFinder(Set<String> words)
{
this.words = words;
}
public List<String> findSubWords(String word) throws NoSolutionFoundException
{
List<String> bestSolution = new ArrayList<>();
if (word.isEmpty())
{
return bestSolution;
}
long length = word.length();
int[] pointer = new int[]{0, 0};
LinkedList<int[]> pointerStack = new LinkedList<>();
LinkedList<String> currentSolution = new LinkedList<>();
while (true)
{
boolean backtrack = false;
for (int end = pointer[1] + 1; end <= length; end++)
{
if (end == length)
{
backtrack = true;
}
pointer[1] = end;
String wordToFind = word.substring(pointer[0], end);
if (words.contains(wordToFind))
{
currentSolution.add(wordToFind);
if (backtrack)
{
if (bestSolution.isEmpty() || (currentSolution.size() <= bestSolution.size() && getSmallestSubWordLength(currentSolution) > getSmallestSubWordLength(bestSolution)))
{
bestSolution = new ArrayList<>(currentSolution);
}
currentSolution.removeLast();
} else if (!bestSolution.isEmpty() && currentSolution.size() == bestSolution.size())
{
currentSolution.removeLast();
backtrack = true;
} else
{
int[] nextPointer = new int[]{end, end};
pointerStack.add(pointer);
pointer = nextPointer;
}
break;
}
}
if (backtrack)
{
if (pointerStack.isEmpty())
{
break;
} else
{
currentSolution.removeLast();
pointer = pointerStack.removeLast();
}
}
}
if (bestSolution.isEmpty())
{
throw new NoSolutionFoundException();
} else
{
return bestSolution;
}
}
private int getSmallestSubWordLength(List<String> words)
{
int length = Integer.MAX_VALUE;
for (String word : words)
{
if (word.length() < length)
{
length = word.length();
}
}
return length;
}
public class NoSolutionFoundException extends Exception
{
private static final long serialVersionUID = 1L;
}
}
我有一个 String
包含小写的常规英语单词。假设这个 String
已经分解为所有可能子词的 List
:
public List<String> getSubWords(String text)
{
List<String> words = new ArrayList<>();
for (int startingIndex = 0; startingIndex < text.length() + 1; startingIndex++)
{
for (int endIndex = startingIndex + 1; endIndex < text.length() + 1; endIndex++)
{
String subString = text.substring(startingIndex, endIndex);
if (contains(subString))
{
words.add(subString);
}
}
}
Comparator<String> lengthComparator = (firstItem, secondItem) ->
{
if (firstItem.length() > secondItem.length())
{
return -1;
}
if (secondItem.length() > firstItem.length())
{
return 1;
}
return 0;
};
// Sort the list in descending String length order
Collections.sort(words, lengthComparator);
return words;
}
如何找到构成原始字符串的最少数量的子词?
例如:
String text = "updatescrollbar";
List<String> leastWords = getLeastSubWords(text);
System.out.println(leastWords);
输出:
[update, scroll, bar]
我不确定如何遍历所有可能性,因为它们会根据所选单词而变化。开始应该是这样的:
public List<String> getLeastSubWords(String text)
{
String textBackup = text;
List<String> subWords = getSubWords(text);
System.out.println(subWords);
List<List<String>> containing = new ArrayList<>();
List<String> validWords = new ArrayList<>();
for (String subWord : subWords)
{
if (text.startsWith(subWord))
{
validWords.add(subWord);
text = text.substring(subWord.length());
}
}
// Did we find a valid words distribution?
if (text.length() == 0)
{
System.out.println(validWords.size());
}
return null;
}
注:
这与 this 问题有关。
这要求很多场景的可能性。
你的例子 (updatescrollbar
) 已经 up date ate update scroll bar
并且仍然很简单,但是如果你有一个词,它有 as 子词,在字符串。
因此,要完成此操作,您必须多次迭代子词列表,跟踪当前最短的有效版本匹配您的文本,并继续迭代,直到尝试所有变体。
您可以减少变体的数量,例如通过使用一种算法,该算法将剩余的要匹配的 space 考虑在内:
- 按长度对子词进行排序,并尝试首先匹配最长词的文本:可能的子词长度=文本-/- 匹配的文本。 这将使用包含,因此要匹配的文本仍然可以在已经匹配的单词之前和之后:为您的文本使用数组,以便更容易跟踪匹配的文本。
UPDATE:如果反转内部 foreach,下面的算法会更有效。在这种情况下,将首先检查较长的单词。
这是回溯算法的典型情况。
将你的单词存储在 TreeSet
中,并实现此算法:
将开始和结束指针设置为
0
。创建一个堆栈来存储以前版本的指针。从起始指针生成子串,同时增加结束指针,查找已知单词。如果找到一个单词,将其存储在一个数组中,并将单词的长度添加到起始指针,将此指针压入堆栈。如果没有找到已知单词或到达最后一个字符,则将开始和结束指针设置为从堆栈中弹出的前一个值(回溯)。重复第 2 步。
要找到最少的子词,您必须存储以前的最佳解决方案,并将其字数与当前解决方案的字数进行比较。
下面是一个示例实现。它包含一些优化实验:没有递归,在错误分支上回溯等。您可以添加更多优化(例如,跟踪使用的起始位置,或查找可能的子词起始位置),但可能没有必要,如果参数是一个不太长的词。
public class SubWordFinder {
private TreeSet<String> words = new TreeSet<String>();
public SubWordFinder(Collection<String> words) {
this.words.addAll(words);
}
public List<String> findSubWords(String word) throws NoSolutionFoundException {
List<String> bestSolution = new ArrayList<String>();
if (word.isEmpty()) {
return bestSolution;
}
long length = word.length();
int[] pointer = new int[]{0, 0};
LinkedList<int[]> pointerStack = new LinkedList<int[]>();
LinkedList<String> currentSolution = new LinkedList<String>();
while (true) {
boolean backtrack = false;
for (int end = pointer[1] + 1; end <= length; end++) {
if (end == length) {
backtrack = true;
}
pointer[1] = end;
String wordToFind = word.substring(pointer[0], end);
if (words.contains(wordToFind)) {
currentSolution.add(wordToFind);
if (backtrack) {
if (bestSolution.isEmpty() || currentSolution.size() < bestSolution.size()) {
bestSolution = new ArrayList<String>(currentSolution);
}
currentSolution.removeLast();
} else if (!bestSolution.isEmpty() && currentSolution.size() == bestSolution.size()) {
currentSolution.removeLast();
backtrack = true;
} else {
int nextStart = end;
int[] nextPointer = new int[]{nextStart, nextStart};
pointerStack.add(pointer);
pointer = nextPointer;
}
break;
}
}
if (backtrack) {
if (pointerStack.isEmpty()) {
break;
} else {
currentSolution.removeLast();
pointer = pointerStack.removeLast();
}
}
}
if (bestSolution.isEmpty()) {
throw new NoSolutionFoundException();
} else {
return bestSolution;
}
}
public class NoSolutionFoundException extends Exception {
private static final long serialVersionUID = 1L;
}
}
测试:
public class SubWordFinderTest {
@Test
public void generalTest() throws SubWordFinder.NoSolutionFoundException {
List<String> words = new ArrayList<String>();
words.add("ab");
words.add("abc");
words.add("cd");
words.add("cde");
words.add("de");
words.add("e");
SubWordFinder finder = new SubWordFinder(words);
assertEquals(new ArrayList<String>(), finder.findSubWords(""));
assertEquals(Arrays.asList("ab", "cde"), finder.findSubWords("abcde"));
assertEquals(Arrays.asList("cd", "cde"), finder.findSubWords("cdcde"));
assertEquals(Arrays.asList("abc", "cd"), finder.findSubWords("abccd"));
assertEquals(Arrays.asList("de", "e", "e", "e", "e"), finder.findSubWords("deeeee"));
try {
finder.findSubWords("ae");
fail();
} catch (SubWordFinder.NoSolutionFoundException e) {
}
try {
finder.findSubWords("abcccd");
fail();
} catch (SubWordFinder.NoSolutionFoundException e) {
}
try {
finder.findSubWords("abcdex");
fail();
} catch (SubWordFinder.NoSolutionFoundException e) {
}
}
@Test
public void dictionaryTest() throws IOException, SubWordFinder.NoSolutionFoundException {
String resourceDir = "/path_to_resources";
InputStream inputStream = getClass().getResource(resourceDir + "/20k.txt").openStream();
InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
List<String> words = new ArrayList<String>();
String line;
while ((line = bufferedReader.readLine()) != null) {
words.add(line);
}
SubWordFinder finder = new SubWordFinder(words);
assertEquals(Arrays.asList("bromide", "freshet"), finder.findSubWords("bromidefreshet"));
}
}