随机播放列表以最大化相似元素之间的距离

Shuffle list to maximise distances between similar elements

在 URL 列表中

http://a.com/foo http://b.com/bar http://a.com/monkey http://c.com/prune http://a.com/bear http://b.com/walrus http://b.com/baz http://b.com/plugh

我想最大化任何一对 a.com、任何一对 b.com 等之间的距离。这需要便宜但不一定是最佳的。 (我正在使用 URL 列表从网站 a.comb.comc.com 下载文件,并且不希望以高于必要的频率访问任何特定站点。在示例中在这里,我们会连续访问 b.com 站点 3 次,应该避免这种情况。)

理想情况下,我会喜欢 Java 库,但会满足于伪代码。 Maximise sum of pairwise distances in array 似乎是一个类似的问题,但没有一个简单的答案 - 我只是想要 "good enough"

因为没有答案,所以我自己写了。它非常粗糙但有效。它读取 URL 列表,提取主机,对它们进行计数,然后用与主机的倒数频率成比例的索引填充一个鸽子洞数组。

package org.xmlcml.cmine.util;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class URLShuffler {

    public static final Logger LOG = Logger.getLogger(URLShuffler.class);
    static {
        LOG.setLevel(Level.DEBUG);
    }

// 以防我们需要额外的鸽巢,但它似乎不适用于中等问题

    private static int TOL = 1;
    private List<String> urls;
    private Multiset<String> domains;
    private Map<String, Integer> currentIndexByDomain;
    private Map<String, Integer> countByDomain;
    private List<String> outputUrls;

    public URLShuffler() {

    }

    public void readURLs(List<String> urls) {
        this.urls= urls;
        domains = HashMultiset.create();
        for (String url : urls) {
            String domain = getDomain(url);
            domains.add(domain);
        }
        LOG.debug(domains);
    }

// this would be better using java.net.URL

    private String getDomain(String url) {
        int idx = url.indexOf("//");
        if (idx != -1) {
            url = url.substring(idx+2);
        }
        idx = url.indexOf("/");
        String domain = url.substring(0,  idx);
        return domain;
    }

    public List<String> getShuffledUrls() {
        currentIndexByDomain = new HashMap<String, Integer>();
        countByDomain = new HashMap<String, Integer>();

        outputUrls = new ArrayList<String>();
        for (int i = 0; i < urls.size() * TOL; i++) {
            outputUrls.add("");
        }

// 这是包装 Guava 排序的便捷方法。

        for (Multiset.Entry<String> entry : CMineUtil.getEntriesSortedByCount(domains)) {
            LOG.debug(entry);
            countByDomain.put(entry.getElement(), entry.getCount());
            currentIndexByDomain.put(entry.getElement(), entry.getCount() - 1);
        }
        for (String url : urls) {
            String domain = getDomain(url);
            Integer currentIndex = currentIndexByDomain.get(domain);
            Integer count = countByDomain.get(domain);
            int slot = (urls.size() * currentIndex * TOL) / count;
            currentIndexByDomain.put(domain, currentIndex - 1);
            addUrl(url, slot);
        }
        return outputUrls;
    }

    private void addUrl(String url, int slot) {
        boolean filled = fillLower(url, slot);
        if (!filled) {
            fillUpper(url, slot);
        }
    }

// if slot is not free run upwards till next free slot

    private boolean fillUpper(String url, int slot) {
        for (int i = slot; i < outputUrls.size(); i++) {
            if (fill(url, i)) {
                return true;
            }
        }
        return false;
    }

// if slot is not free run downwards till next free slot
    private boolean fillLower(String url, int slot) {
        for (int i = slot; i >= 0; i--) {
            if (fill(url, i)) {
                return true;
            }
        }
        return false;
    }

    private boolean fill(String url, int slot) {
        if (outputUrls.get(slot).equals("")) {
            outputUrls.set(slot, url);
            return true;
        }
        return false;
    }
}

```