随机播放列表以最大化相似元素之间的距离
Shuffle list to maximise distances between similar elements
在 URL 列表中
http://a.com/foo
http://b.com/bar
http://a.com/monkey
http://c.com/prune
http://a.com/bear
http://b.com/walrus
http://b.com/baz
http://b.com/plugh
我想最大化任何一对 a.com
、任何一对 b.com
等之间的距离。这需要便宜但不一定是最佳的。 (我正在使用 URL 列表从网站 a.com
、b.com
、c.com
下载文件,并且不希望以高于必要的频率访问任何特定站点。在示例中在这里,我们会连续访问 b.com
站点 3 次,应该避免这种情况。)
理想情况下,我会喜欢 Java 库,但会满足于伪代码。
Maximise sum of pairwise distances in array 似乎是一个类似的问题,但没有一个简单的答案 - 我只是想要 "good enough"
因为没有答案,所以我自己写了。它非常粗糙但有效。它读取 URL 列表,提取主机,对它们进行计数,然后用与主机的倒数频率成比例的索引填充一个鸽子洞数组。
package org.xmlcml.cmine.util;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
public class URLShuffler {
public static final Logger LOG = Logger.getLogger(URLShuffler.class);
static {
LOG.setLevel(Level.DEBUG);
}
// 以防我们需要额外的鸽巢,但它似乎不适用于中等问题
private static int TOL = 1;
private List<String> urls;
private Multiset<String> domains;
private Map<String, Integer> currentIndexByDomain;
private Map<String, Integer> countByDomain;
private List<String> outputUrls;
public URLShuffler() {
}
public void readURLs(List<String> urls) {
this.urls= urls;
domains = HashMultiset.create();
for (String url : urls) {
String domain = getDomain(url);
domains.add(domain);
}
LOG.debug(domains);
}
// this would be better using java.net.URL
private String getDomain(String url) {
int idx = url.indexOf("//");
if (idx != -1) {
url = url.substring(idx+2);
}
idx = url.indexOf("/");
String domain = url.substring(0, idx);
return domain;
}
public List<String> getShuffledUrls() {
currentIndexByDomain = new HashMap<String, Integer>();
countByDomain = new HashMap<String, Integer>();
outputUrls = new ArrayList<String>();
for (int i = 0; i < urls.size() * TOL; i++) {
outputUrls.add("");
}
// 这是包装 Guava 排序的便捷方法。
for (Multiset.Entry<String> entry : CMineUtil.getEntriesSortedByCount(domains)) {
LOG.debug(entry);
countByDomain.put(entry.getElement(), entry.getCount());
currentIndexByDomain.put(entry.getElement(), entry.getCount() - 1);
}
for (String url : urls) {
String domain = getDomain(url);
Integer currentIndex = currentIndexByDomain.get(domain);
Integer count = countByDomain.get(domain);
int slot = (urls.size() * currentIndex * TOL) / count;
currentIndexByDomain.put(domain, currentIndex - 1);
addUrl(url, slot);
}
return outputUrls;
}
private void addUrl(String url, int slot) {
boolean filled = fillLower(url, slot);
if (!filled) {
fillUpper(url, slot);
}
}
// if slot is not free run upwards till next free slot
private boolean fillUpper(String url, int slot) {
for (int i = slot; i < outputUrls.size(); i++) {
if (fill(url, i)) {
return true;
}
}
return false;
}
// if slot is not free run downwards till next free slot
private boolean fillLower(String url, int slot) {
for (int i = slot; i >= 0; i--) {
if (fill(url, i)) {
return true;
}
}
return false;
}
private boolean fill(String url, int slot) {
if (outputUrls.get(slot).equals("")) {
outputUrls.set(slot, url);
return true;
}
return false;
}
}
```
在 URL 列表中
http://a.com/foo
http://b.com/bar
http://a.com/monkey
http://c.com/prune
http://a.com/bear
http://b.com/walrus
http://b.com/baz
http://b.com/plugh
我想最大化任何一对 a.com
、任何一对 b.com
等之间的距离。这需要便宜但不一定是最佳的。 (我正在使用 URL 列表从网站 a.com
、b.com
、c.com
下载文件,并且不希望以高于必要的频率访问任何特定站点。在示例中在这里,我们会连续访问 b.com
站点 3 次,应该避免这种情况。)
理想情况下,我会喜欢 Java 库,但会满足于伪代码。 Maximise sum of pairwise distances in array 似乎是一个类似的问题,但没有一个简单的答案 - 我只是想要 "good enough"
因为没有答案,所以我自己写了。它非常粗糙但有效。它读取 URL 列表,提取主机,对它们进行计数,然后用与主机的倒数频率成比例的索引填充一个鸽子洞数组。
package org.xmlcml.cmine.util;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
public class URLShuffler {
public static final Logger LOG = Logger.getLogger(URLShuffler.class);
static {
LOG.setLevel(Level.DEBUG);
}
// 以防我们需要额外的鸽巢,但它似乎不适用于中等问题
private static int TOL = 1;
private List<String> urls;
private Multiset<String> domains;
private Map<String, Integer> currentIndexByDomain;
private Map<String, Integer> countByDomain;
private List<String> outputUrls;
public URLShuffler() {
}
public void readURLs(List<String> urls) {
this.urls= urls;
domains = HashMultiset.create();
for (String url : urls) {
String domain = getDomain(url);
domains.add(domain);
}
LOG.debug(domains);
}
// this would be better using java.net.URL
private String getDomain(String url) {
int idx = url.indexOf("//");
if (idx != -1) {
url = url.substring(idx+2);
}
idx = url.indexOf("/");
String domain = url.substring(0, idx);
return domain;
}
public List<String> getShuffledUrls() {
currentIndexByDomain = new HashMap<String, Integer>();
countByDomain = new HashMap<String, Integer>();
outputUrls = new ArrayList<String>();
for (int i = 0; i < urls.size() * TOL; i++) {
outputUrls.add("");
}
// 这是包装 Guava 排序的便捷方法。
for (Multiset.Entry<String> entry : CMineUtil.getEntriesSortedByCount(domains)) {
LOG.debug(entry);
countByDomain.put(entry.getElement(), entry.getCount());
currentIndexByDomain.put(entry.getElement(), entry.getCount() - 1);
}
for (String url : urls) {
String domain = getDomain(url);
Integer currentIndex = currentIndexByDomain.get(domain);
Integer count = countByDomain.get(domain);
int slot = (urls.size() * currentIndex * TOL) / count;
currentIndexByDomain.put(domain, currentIndex - 1);
addUrl(url, slot);
}
return outputUrls;
}
private void addUrl(String url, int slot) {
boolean filled = fillLower(url, slot);
if (!filled) {
fillUpper(url, slot);
}
}
// if slot is not free run upwards till next free slot
private boolean fillUpper(String url, int slot) {
for (int i = slot; i < outputUrls.size(); i++) {
if (fill(url, i)) {
return true;
}
}
return false;
}
// if slot is not free run downwards till next free slot
private boolean fillLower(String url, int slot) {
for (int i = slot; i >= 0; i--) {
if (fill(url, i)) {
return true;
}
}
return false;
}
private boolean fill(String url, int slot) {
if (outputUrls.get(slot).equals("")) {
outputUrls.set(slot, url);
return true;
}
return false;
}
}
```