Java 代理发现机器人
Java Proxy Discovering Bot
我写了一个 class,ProxyFinder,它连接到随机 ip 并首先 ping 它们,如果它们响应,则尝试通过通用代理端口创建一个 http 代理连接。
目前,它只是连接到随机 IP。这样比较快,一个小时能发现几个代理。但是,我想以某种方式检查之前是否已经连接过某个 IP。首先,我尝试将它们保存在一个列表中,但那使用了超过 10GB 的内存。我在下面的代码中包含了一个方法,该方法使用 RandomAccessFile 将数据写入缓存文件,但随着文件变大,每次连接都要搜索整个文件,速度会越来越慢。
我正在以尽可能小的格式存储数据,每个 IP 仅四个字节。即便如此,这也是 4 * 256 * 256 * 256 * 256 字节 = 16 GB 的内存,或者每次要测试另一个 IP 时都要搜索一个 16 GB 的文件。
我还尝试创建一个单独的线程来生成 ips,根据文件检查它们,然后将它们添加到探测线程可以从中提取的队列中。它也跟不上探测线程。
我怎样才能快速检查我是否已经连接到一个 IP,而不会非常慢或使用大量内存?
package net;
import java.io.File;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Probes random IPv4 addresses for open HTTP proxies.
 * <p>
 * Every worker thread picks a random address, pings it and, when the host
 * answers, tries to fetch a page through it on a few common proxy ports.
 *
 * @author Colby
 */
public class ProxyFinder {

    /** Proxy ports tried on every host that answers a ping (no duplicates). */
    private static final int[] PORTS = {1080, 3128, 8080};

    /** Number of concurrently probing threads. */
    private static final int WORKERS = 500;

    /** Size of the IPv4 address space, used for the progress percentage. */
    private static final float ADDRESS_SPACE = 256f * 256f * 256f * 256f;

    /** Lazily created file holding every address handed out, four bytes each. */
    private static RandomAccessFile cache;

    /**
     * Starts the probing threads; they run until the process is killed.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Starting network probe");
        AtomicInteger counter = new AtomicInteger();
        for (int i = 0; i < WORKERS; i++) {
            new Thread(() -> probeLoop(counter)).start();
        }
    }

    /**
     * Endless probe loop of one worker thread.
     *
     * @param counter shared count of hosts that answered a ping; the printed
     *                percentage is derived from this value
     */
    private static void probeLoop(AtomicInteger counter) {
        while (true) {
            try {
                byte[] addrBytes = randomAddress(); // could be getNextAddress also
                if (addrBytes == null) {
                    break;
                }
                InetAddress addr = InetAddress.getByAddress(addrBytes);
                if (!ping(addr)) {
                    continue;
                }
                // Read before incrementing so the value matches what was printed historically.
                float percent = counter.get() / ADDRESS_SPACE * 100F;
                if (counter.incrementAndGet() % 10000 == 0) {
                    System.out.println("Searching " + percent + "% network search");
                }
                for (int port : PORTS) {
                    if (isHttpProxy(addr, port)) {
                        System.out.println("Proxy found!" + addr.getHostAddress() + ":" + port
                                + " Found at " + percent + "% network search");
                    }
                }
            } catch (Exception ignored) {
                // Best effort: an address that cannot be probed is simply skipped.
            }
        }
    }

    /**
     * Tries to fetch a page through {@code addr:port} used as an HTTP proxy.
     *
     * @return {@code true} if the request completed, {@code false} on any failure
     */
    private static boolean isHttpProxy(InetAddress addr, int port) {
        HttpURLConnection con = null;
        try {
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(addr, port));
            con = (HttpURLConnection) new URL("http://google.com").openConnection(proxy);
            con.setConnectTimeout(1000);
            con.setReadTimeout(1000);
            con.setRequestMethod("GET");
            con.setRequestProperty("User-Agent", "Mozilla/5.0");
            con.getContent();
            return true;
        } catch (Exception ignored) {
            // Expected for almost every host: it is not a proxy or it timed out.
            return false;
        } finally {
            // Release the connection on the failure path as well.
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Returns a random address that this method has not returned before.
     * <p>
     * Handed-out addresses are appended to a temp file that is scanned linearly
     * on every call, so the cost grows with the number of addresses issued.
     * Synchronized because all worker threads would share the one file handle.
     *
     * @return four bytes of a not yet issued IPv4 address
     * @throws Exception if the cache file cannot be created, read or written
     */
    private static synchronized byte[] getNextAddress() throws Exception {
        if (cache == null) {
            File file = File.createTempFile("abc", ".tmp");
            file.deleteOnExit(); // do not leave the cache behind after the run
            cache = new RandomAccessFile(file, "rw");
        }
        while (true) {
            byte[] candidate = randomAddress();
            if (!isCached(candidate)) {
                // isCached left the file pointer at the end, so this appends.
                cache.write(candidate);
                return candidate;
            }
        }
    }

    /**
     * Scans the cache file from the start for {@code candidate}.
     *
     * @return {@code true} if the address is already stored in the cache
     */
    private static boolean isCached(byte[] candidate) throws Exception {
        byte[] stored = new byte[4];
        cache.seek(0);
        while (cache.getFilePointer() < cache.length()) {
            cache.readFully(stored);
            if (Arrays.equals(candidate, stored)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Generates a random IPv4 address.
     *
     * @return four random bytes
     */
    private static byte[] randomAddress() {
        return new byte[]{(byte) (Math.random() * 256), (byte) (Math.random() * 256), (byte) (Math.random() * 256), (byte) (Math.random() * 256)};
    }

    /**
     * Checks whether the host answers within 500 ms.
     */
    private static boolean ping(InetAddress addr) throws Exception {
        return addr.isReachable(500);
    }
}
此外,如果有人想知道,我已经使用这个 运行 12 个小时了,它发现了大约 50 个代理,并且 ping 了大约 2.09664E-4% 的 ip 范围,大约 120 万ips。对于分配的带宽来说还不错 (0.5Mbps)
编辑:我开始认为存储和检查所有这些 IP 的开销可能比在搜索 ip 范围快结束时简单地连接到许多重复项还要大。
使用像 MySQL 这样的数据库,以及带有一级和二级缓存的 Hibernate。
如果您为 Hibernate 配置缓存,并调整数据库以使用几 GB 的缓存,它将比 RAM 更快。我认为它们都是这样做的。还可以配置外部缓存,如 Ehcache,将其配置为运行在另一个进程中并落盘到文件,同时限制大小和存活时间。数据库知道如何比纯 RAM 更快地索引和查找内容——在您这种 IP 数据规模的情况下。
此外,您可以通过按第一个字符、第二个字符等对 table 数据和索引进行分区来改进
由于数据量大,我不会存储整个 IP 地址。将它们存储在 BitSet 的数组中会消耗更少的内存。
编辑 删除了以前的代码版本,它不正确
以下版本生成随机地址并将它们保存在文件中。如果找到前一个 运行 的持久性文件,则从该文件中恢复所见地址的信息。
以下情况在初始版本中未正确处理:
assuming that no address was already seen
1.0.0.1 - seen false
2.0.0.2 - seen false
2.0.0.1 - seen true, which was wrong and is correctly handled by code below
有关更多信息,请参阅代码中的注释。
/**
 * Remembers which IPv4 addresses have been seen, one bit per address, and
 * persists that state in a file between runs.
 * <p>
 * The address space is split into 256 ranges selected by the first octet;
 * the remaining three octets form the bit offset inside the range.
 */
public class KeepSeenAddresses {

    static final int FILE_BUFFER_SIZE = 81_920;
    static final int RANGES_SIZE = 256;
    /** Addresses per range: all combinations of the three low octets (256 * 256 * 256). */
    static final int RANGE_BITS = 1 << 24;

    // one BitSet of RANGE_BITS flags per value of the first octet
    static BitSet[] ranges;

    // Random(1) is taken only for demonstration purpose, so the second
    // application run will find the same seen addresses from previous run
    static Random random = new Random(1);
    // for normal use it's better to have better randomness
    //static Random random = new Random(System.currentTimeMillis());

    /**
     * Restores (or creates) the seen-address store, processes a few random
     * addresses and writes the store back to disk.
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException {
        if (!readRanges()) {
            initRanges();
        }

        // this case was failing in the initial solution
        // uncomment this block to see how all edge cases
        // which were mentioned in other comments are handled
        /*
        byte[][] addresses = {
            {1, 0, 0, 1},
            {2, 0, 0, 2},
            {2, 0, 0, 1},
            {1, 2, 3, 4},
            {4, 3, 2, 1},
            {(byte)128, 0, 0, 0},
            {(byte)255, (byte)255, (byte)255, (byte)255}
        };
        seenAddress(addresses[0]);
        seenAddress(addresses[1]);
        seenAddress(addresses[3]);
        seenAddress(addresses[5]);
        seenAddress(addresses[6]);
        for (byte[] addressBytes : addresses) {
            System.out.printf("seen %s before: %s%n",
                    prettyAddress(addressBytes),
                    seenBefore(addressBytes)
            );
        }
        */
        processAddresses();
        persistRanges();
    }

    /**
     * Read the seen addresses from a file.
     * <p>
     * NOTE: this uses Java native deserialization, so the file must come from
     * a trusted source (i.e. be written by {@link #persistRanges()}).
     *
     * @return <code>true</code> if the file was found and has the expected
     * number of ranges, otherwise <code>false</code>
     * @throws IOException if the file cannot be read
     * @throws ClassNotFoundException if the stored object cannot be resolved
     */
    private static boolean readRanges() throws IOException, ClassNotFoundException {
        File rangesStore = new File("addresses.bin");
        if (!rangesStore.exists()) {
            return false;
        }
        System.out.print("found previous rangesStore... ");
        BitSet[] restored;
        try (ObjectInputStream ois = new ObjectInputStream(
                new BufferedInputStream(
                        new FileInputStream(rangesStore), FILE_BUFFER_SIZE
                )
        )) {
            restored = (BitSet[]) ois.readObject();
        }
        // A serialized null would otherwise blow up on the length check.
        int found = restored == null ? 0 : restored.length;
        if (found != RANGES_SIZE) {
            System.out.printf("wrong size of rangesStore: expected %d"
                    + " found: %d%n", RANGES_SIZE, found);
            return false;
        }
        ranges = restored;
        System.out.printf("restored ranges: %d%n", ranges.length);
        return true;
    }

    /**
     * Initialize the address ranges array. All address flags will be set to
     * <code>false</code>.
     */
    private static void initRanges() {
        System.out.print("initialize new rangesStore... ");
        ranges = new BitSet[RANGES_SIZE];
        for (int i = 0; i < RANGES_SIZE; i++) {
            // A new BitSet starts with every bit false; no explicit clearing needed.
            ranges[i] = new BitSet(RANGE_BITS);
        }
        System.out.printf("initialized ranges: %d%n", RANGES_SIZE);
    }

    /**
     * For demonstration purpose.<br>
     * Generates some random IPv4 addresses. If the address was not seen before
     * the flag for this address will be set to <code>true</code>.
     */
    private static void processAddresses() {
        for (int i = 0; i < 10; i++) {
            byte[] addrBytes = randomAddress();
            boolean seenBefore = seenBefore(addrBytes);
            if (!seenBefore) {
                seenAddress(addrBytes);
            }
            System.out.printf("seen %s before: %s%n",
                    prettyAddress(addrBytes),
                    seenBefore
            );
        }
    }

    /**
     * Persist the address ranges array. The file size is around 500MB.
     *
     * @throws IOException if the file cannot be written
     */
    private static void persistRanges() throws IOException {
        System.out.print("persist rangesStore... ");
        try (ObjectOutputStream oos = new ObjectOutputStream(
                new BufferedOutputStream(
                        new FileOutputStream("addresses.bin"), FILE_BUFFER_SIZE)
        )) {
            oos.writeObject(ranges);
        }
        System.out.printf("written ranges: %d%n", ranges.length);
    }

    /**
     * Keep a flag which address has been seen already.
     *
     * @param addrBytes IPv4 address in four bytes
     */
    static void seenAddress(byte[] addrBytes) {
        ranges[rangeIndex(addrBytes)].set(rangeOffset(addrBytes));
    }

    /**
     * Check if the passed address was seen before.
     *
     * @param addrBytes IPv4 address in four bytes
     * @return <code>true</code> if the address was seen before, otherwise
     * <code>false</code>
     */
    static boolean seenBefore(byte[] addrBytes) {
        return ranges[rangeIndex(addrBytes)].get(rangeOffset(addrBytes));
    }

    /**
     * Select the range from the first octet, read as an unsigned value.
     */
    private static int rangeIndex(byte[] addrBytes) {
        return addrBytes[0] & 0xff;
    }

    /**
     * Pack the three low octets into a unique 24-bit offset. Each octet is
     * masked to its unsigned value before it is shifted into place, so no two
     * addresses of the same range share an offset.
     */
    private static int rangeOffset(byte[] addrBytes) {
        return ((addrBytes[1] & 0xff) << 16)
                | ((addrBytes[2] & 0xff) << 8)
                | (addrBytes[3] & 0xff);
    }

    /**
     * Convert the IPv4 address into pretty string.
     *
     * @param addrBytes IPv4 address in four bytes
     * @return pretty String of the IPv4 address
     */
    static String prettyAddress(byte[] addrBytes) {
        return String.format("%03d.%03d.%03d.%03d",
                addrBytes[0] & 0xff,
                addrBytes[1] & 0xff,
                addrBytes[2] & 0xff,
                addrBytes[3] & 0xff);
    }

    /**
     * Generate a random IPv4 address.
     *
     * @return four bytes of a random generated IPv4 address
     */
    private static byte[] randomAddress() {
        byte[] bytes = new byte[4];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = (byte) random.nextInt(256);
        }
        return bytes;
    }
}
我从这里移植了另一个解决方案的代码来解决这个问题:
上述问题的答案深入解释了以下代码的工作原理。如果其他人想在此帖中发布更深入的回答,我会采纳该回答。
// Seen-flags for addresses whose first octet is 0..127.
static BitSet set = new BitSet();
// Seen-flags for addresses whose first octet is 128..255. A single BitSet is
// indexed by a non-negative int and therefore cannot cover all 2^32 addresses.
static BitSet upperSet = new BitSet();

/**
 * Packs four octets into a non-negative 31-bit index.
 * Only the low 7 bits of {@code i} and the low 8 bits of the other octets are
 * used, so signed byte values can be passed directly; the top bit of the
 * first octet selects the BitSet instead (see {@code halfFor}).
 */
static int pos(int i, int j, int k, int m) {
    return ((i & 0x7f) << 24) | ((j & 0xff) << 16) | ((k & 0xff) << 8) | (m & 0xff);
}

/** Picks the BitSet responsible for the address: a negative first byte means octet >= 128. */
private static BitSet halfFor(byte[] addr) {
    return addr[0] < 0 ? upperSet : set;
}

/**
 * @param addr IPv4 address in four bytes
 * @return whether the flag for the address is set
 */
static boolean get(byte[] addr) {
    return halfFor(addr).get(pos(addr[0], addr[1], addr[2], addr[3]));
}

/**
 * Sets or clears the flag for the address.
 *
 * @param addr IPv4 address in four bytes
 * @param flag new value of the flag
 */
static void set(byte[] addr, boolean flag) {
    halfFor(addr).set(pos(addr[0], addr[1], addr[2], addr[3]), flag);
}
我写了一个 class,ProxyFinder,它连接到随机 ip 并首先 ping 它们,如果它们响应,则尝试通过通用代理端口创建一个 http 代理连接。
目前,它只是连接到随机 IP。这样比较快,一个小时能发现几个代理。但是,我想以某种方式检查之前是否已经连接过某个 IP。首先,我尝试将它们保存在一个列表中,但那使用了超过 10GB 的内存。我在下面的代码中包含了一个方法,该方法使用 RandomAccessFile 将数据写入缓存文件,但随着文件变大,每次连接都要搜索整个文件,速度会越来越慢。
我正在以尽可能小的格式存储数据,每个 IP 仅四个字节。即便如此,这也是 4 * 256 * 256 * 256 * 256 字节 = 16 GB 的内存,或者每次要测试另一个 IP 时都要搜索一个 16 GB 的文件。
我还尝试创建一个单独的线程来生成 ips,根据文件检查它们,然后将它们添加到探测线程可以从中提取的队列中。它也跟不上探测线程。
我怎样才能快速检查我是否已经连接到一个 IP,而不会非常慢或使用大量内存?
package net;
import java.io.File;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Probes random IPv4 addresses for open HTTP proxies.
 * <p>
 * Every worker thread picks a random address, pings it and, when the host
 * answers, tries to fetch a page through it on a few common proxy ports.
 *
 * @author Colby
 */
public class ProxyFinder {

    /** Proxy ports tried on every host that answers a ping (no duplicates). */
    private static final int[] PORTS = {1080, 3128, 8080};

    /** Number of concurrently probing threads. */
    private static final int WORKERS = 500;

    /** Size of the IPv4 address space, used for the progress percentage. */
    private static final float ADDRESS_SPACE = 256f * 256f * 256f * 256f;

    /** Lazily created file holding every address handed out, four bytes each. */
    private static RandomAccessFile cache;

    /**
     * Starts the probing threads; they run until the process is killed.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Starting network probe");
        AtomicInteger counter = new AtomicInteger();
        for (int i = 0; i < WORKERS; i++) {
            new Thread(() -> probeLoop(counter)).start();
        }
    }

    /**
     * Endless probe loop of one worker thread.
     *
     * @param counter shared count of hosts that answered a ping; the printed
     *                percentage is derived from this value
     */
    private static void probeLoop(AtomicInteger counter) {
        while (true) {
            try {
                byte[] addrBytes = randomAddress(); // could be getNextAddress also
                if (addrBytes == null) {
                    break;
                }
                InetAddress addr = InetAddress.getByAddress(addrBytes);
                if (!ping(addr)) {
                    continue;
                }
                // Read before incrementing so the value matches what was printed historically.
                float percent = counter.get() / ADDRESS_SPACE * 100F;
                if (counter.incrementAndGet() % 10000 == 0) {
                    System.out.println("Searching " + percent + "% network search");
                }
                for (int port : PORTS) {
                    if (isHttpProxy(addr, port)) {
                        System.out.println("Proxy found!" + addr.getHostAddress() + ":" + port
                                + " Found at " + percent + "% network search");
                    }
                }
            } catch (Exception ignored) {
                // Best effort: an address that cannot be probed is simply skipped.
            }
        }
    }

    /**
     * Tries to fetch a page through {@code addr:port} used as an HTTP proxy.
     *
     * @return {@code true} if the request completed, {@code false} on any failure
     */
    private static boolean isHttpProxy(InetAddress addr, int port) {
        HttpURLConnection con = null;
        try {
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(addr, port));
            con = (HttpURLConnection) new URL("http://google.com").openConnection(proxy);
            con.setConnectTimeout(1000);
            con.setReadTimeout(1000);
            con.setRequestMethod("GET");
            con.setRequestProperty("User-Agent", "Mozilla/5.0");
            con.getContent();
            return true;
        } catch (Exception ignored) {
            // Expected for almost every host: it is not a proxy or it timed out.
            return false;
        } finally {
            // Release the connection on the failure path as well.
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Returns a random address that this method has not returned before.
     * <p>
     * Handed-out addresses are appended to a temp file that is scanned linearly
     * on every call, so the cost grows with the number of addresses issued.
     * Synchronized because all worker threads would share the one file handle.
     *
     * @return four bytes of a not yet issued IPv4 address
     * @throws Exception if the cache file cannot be created, read or written
     */
    private static synchronized byte[] getNextAddress() throws Exception {
        if (cache == null) {
            File file = File.createTempFile("abc", ".tmp");
            file.deleteOnExit(); // do not leave the cache behind after the run
            cache = new RandomAccessFile(file, "rw");
        }
        while (true) {
            byte[] candidate = randomAddress();
            if (!isCached(candidate)) {
                // isCached left the file pointer at the end, so this appends.
                cache.write(candidate);
                return candidate;
            }
        }
    }

    /**
     * Scans the cache file from the start for {@code candidate}.
     *
     * @return {@code true} if the address is already stored in the cache
     */
    private static boolean isCached(byte[] candidate) throws Exception {
        byte[] stored = new byte[4];
        cache.seek(0);
        while (cache.getFilePointer() < cache.length()) {
            cache.readFully(stored);
            if (Arrays.equals(candidate, stored)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Generates a random IPv4 address.
     *
     * @return four random bytes
     */
    private static byte[] randomAddress() {
        return new byte[]{(byte) (Math.random() * 256), (byte) (Math.random() * 256), (byte) (Math.random() * 256), (byte) (Math.random() * 256)};
    }

    /**
     * Checks whether the host answers within 500 ms.
     */
    private static boolean ping(InetAddress addr) throws Exception {
        return addr.isReachable(500);
    }
}
此外,如果有人想知道,我已经使用这个 运行 12 个小时了,它发现了大约 50 个代理,并且 ping 了大约 2.09664E-4% 的 ip 范围,大约 120 万ips。对于分配的带宽来说还不错 (0.5Mbps)
编辑:我开始认为存储和检查所有这些 IP 的开销可能比在搜索 ip 范围快结束时简单地连接到许多重复项还要大。
使用像 MySQL 这样的数据库,以及带有一级和二级缓存的 Hibernate。
如果您为 Hibernate 配置缓存,并调整数据库以使用几 GB 的缓存,它将比 RAM 更快。我认为它们都是这样做的。还可以配置外部缓存,如 Ehcache,将其配置为运行在另一个进程中并落盘到文件,同时限制大小和存活时间。数据库知道如何比纯 RAM 更快地索引和查找内容——在您这种 IP 数据规模的情况下。
此外,您可以通过按第一个字符、第二个字符等对 table 数据和索引进行分区来改进
由于数据量大,我不会存储整个 IP 地址。将它们存储在 BitSet 的数组中会消耗更少的内存。
编辑 删除了以前的代码版本,它不正确
以下版本生成随机地址并将它们保存在文件中。如果找到前一个 运行 的持久性文件,则从该文件中恢复所见地址的信息。
以下情况在初始版本中未正确处理:
assuming that no address was already seen
1.0.0.1 - seen false
2.0.0.2 - seen false
2.0.0.1 - seen true, which was wrong and is correctly handled by code below
有关更多信息,请参阅代码中的注释。
/**
 * Remembers which IPv4 addresses have been seen, one bit per address, and
 * persists that state in a file between runs.
 * <p>
 * The address space is split into 256 ranges selected by the first octet;
 * the remaining three octets form the bit offset inside the range.
 */
public class KeepSeenAddresses {

    static final int FILE_BUFFER_SIZE = 81_920;
    static final int RANGES_SIZE = 256;
    /** Addresses per range: all combinations of the three low octets (256 * 256 * 256). */
    static final int RANGE_BITS = 1 << 24;

    // one BitSet of RANGE_BITS flags per value of the first octet
    static BitSet[] ranges;

    // Random(1) is taken only for demonstration purpose, so the second
    // application run will find the same seen addresses from previous run
    static Random random = new Random(1);
    // for normal use it's better to have better randomness
    //static Random random = new Random(System.currentTimeMillis());

    /**
     * Restores (or creates) the seen-address store, processes a few random
     * addresses and writes the store back to disk.
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException {
        if (!readRanges()) {
            initRanges();
        }

        // this case was failing in the initial solution
        // uncomment this block to see how all edge cases
        // which were mentioned in other comments are handled
        /*
        byte[][] addresses = {
            {1, 0, 0, 1},
            {2, 0, 0, 2},
            {2, 0, 0, 1},
            {1, 2, 3, 4},
            {4, 3, 2, 1},
            {(byte)128, 0, 0, 0},
            {(byte)255, (byte)255, (byte)255, (byte)255}
        };
        seenAddress(addresses[0]);
        seenAddress(addresses[1]);
        seenAddress(addresses[3]);
        seenAddress(addresses[5]);
        seenAddress(addresses[6]);
        for (byte[] addressBytes : addresses) {
            System.out.printf("seen %s before: %s%n",
                    prettyAddress(addressBytes),
                    seenBefore(addressBytes)
            );
        }
        */
        processAddresses();
        persistRanges();
    }

    /**
     * Read the seen addresses from a file.
     * <p>
     * NOTE: this uses Java native deserialization, so the file must come from
     * a trusted source (i.e. be written by {@link #persistRanges()}).
     *
     * @return <code>true</code> if the file was found and has the expected
     * number of ranges, otherwise <code>false</code>
     * @throws IOException if the file cannot be read
     * @throws ClassNotFoundException if the stored object cannot be resolved
     */
    private static boolean readRanges() throws IOException, ClassNotFoundException {
        File rangesStore = new File("addresses.bin");
        if (!rangesStore.exists()) {
            return false;
        }
        System.out.print("found previous rangesStore... ");
        BitSet[] restored;
        try (ObjectInputStream ois = new ObjectInputStream(
                new BufferedInputStream(
                        new FileInputStream(rangesStore), FILE_BUFFER_SIZE
                )
        )) {
            restored = (BitSet[]) ois.readObject();
        }
        // A serialized null would otherwise blow up on the length check.
        int found = restored == null ? 0 : restored.length;
        if (found != RANGES_SIZE) {
            System.out.printf("wrong size of rangesStore: expected %d"
                    + " found: %d%n", RANGES_SIZE, found);
            return false;
        }
        ranges = restored;
        System.out.printf("restored ranges: %d%n", ranges.length);
        return true;
    }

    /**
     * Initialize the address ranges array. All address flags will be set to
     * <code>false</code>.
     */
    private static void initRanges() {
        System.out.print("initialize new rangesStore... ");
        ranges = new BitSet[RANGES_SIZE];
        for (int i = 0; i < RANGES_SIZE; i++) {
            // A new BitSet starts with every bit false; no explicit clearing needed.
            ranges[i] = new BitSet(RANGE_BITS);
        }
        System.out.printf("initialized ranges: %d%n", RANGES_SIZE);
    }

    /**
     * For demonstration purpose.<br>
     * Generates some random IPv4 addresses. If the address was not seen before
     * the flag for this address will be set to <code>true</code>.
     */
    private static void processAddresses() {
        for (int i = 0; i < 10; i++) {
            byte[] addrBytes = randomAddress();
            boolean seenBefore = seenBefore(addrBytes);
            if (!seenBefore) {
                seenAddress(addrBytes);
            }
            System.out.printf("seen %s before: %s%n",
                    prettyAddress(addrBytes),
                    seenBefore
            );
        }
    }

    /**
     * Persist the address ranges array. The file size is around 500MB.
     *
     * @throws IOException if the file cannot be written
     */
    private static void persistRanges() throws IOException {
        System.out.print("persist rangesStore... ");
        try (ObjectOutputStream oos = new ObjectOutputStream(
                new BufferedOutputStream(
                        new FileOutputStream("addresses.bin"), FILE_BUFFER_SIZE)
        )) {
            oos.writeObject(ranges);
        }
        System.out.printf("written ranges: %d%n", ranges.length);
    }

    /**
     * Keep a flag which address has been seen already.
     *
     * @param addrBytes IPv4 address in four bytes
     */
    static void seenAddress(byte[] addrBytes) {
        ranges[rangeIndex(addrBytes)].set(rangeOffset(addrBytes));
    }

    /**
     * Check if the passed address was seen before.
     *
     * @param addrBytes IPv4 address in four bytes
     * @return <code>true</code> if the address was seen before, otherwise
     * <code>false</code>
     */
    static boolean seenBefore(byte[] addrBytes) {
        return ranges[rangeIndex(addrBytes)].get(rangeOffset(addrBytes));
    }

    /**
     * Select the range from the first octet, read as an unsigned value.
     */
    private static int rangeIndex(byte[] addrBytes) {
        return addrBytes[0] & 0xff;
    }

    /**
     * Pack the three low octets into a unique 24-bit offset. Each octet is
     * masked to its unsigned value before it is shifted into place, so no two
     * addresses of the same range share an offset.
     */
    private static int rangeOffset(byte[] addrBytes) {
        return ((addrBytes[1] & 0xff) << 16)
                | ((addrBytes[2] & 0xff) << 8)
                | (addrBytes[3] & 0xff);
    }

    /**
     * Convert the IPv4 address into pretty string.
     *
     * @param addrBytes IPv4 address in four bytes
     * @return pretty String of the IPv4 address
     */
    static String prettyAddress(byte[] addrBytes) {
        return String.format("%03d.%03d.%03d.%03d",
                addrBytes[0] & 0xff,
                addrBytes[1] & 0xff,
                addrBytes[2] & 0xff,
                addrBytes[3] & 0xff);
    }

    /**
     * Generate a random IPv4 address.
     *
     * @return four bytes of a random generated IPv4 address
     */
    private static byte[] randomAddress() {
        byte[] bytes = new byte[4];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = (byte) random.nextInt(256);
        }
        return bytes;
    }
}
我从这里移植了另一个解决方案的代码来解决这个问题:
上述问题的答案深入解释了以下代码的工作原理。如果其他人想在此帖中发布更深入的回答,我会采纳该回答。
// Seen-flags for addresses whose first octet is 0..127.
static BitSet set = new BitSet();
// Seen-flags for addresses whose first octet is 128..255. A single BitSet is
// indexed by a non-negative int and therefore cannot cover all 2^32 addresses.
static BitSet upperSet = new BitSet();

/**
 * Packs four octets into a non-negative 31-bit index.
 * Only the low 7 bits of {@code i} and the low 8 bits of the other octets are
 * used, so signed byte values can be passed directly; the top bit of the
 * first octet selects the BitSet instead (see {@code halfFor}).
 */
static int pos(int i, int j, int k, int m) {
    return ((i & 0x7f) << 24) | ((j & 0xff) << 16) | ((k & 0xff) << 8) | (m & 0xff);
}

/** Picks the BitSet responsible for the address: a negative first byte means octet >= 128. */
private static BitSet halfFor(byte[] addr) {
    return addr[0] < 0 ? upperSet : set;
}

/**
 * @param addr IPv4 address in four bytes
 * @return whether the flag for the address is set
 */
static boolean get(byte[] addr) {
    return halfFor(addr).get(pos(addr[0], addr[1], addr[2], addr[3]));
}

/**
 * Sets or clears the flag for the address.
 *
 * @param addr IPv4 address in four bytes
 * @param flag new value of the flag
 */
static void set(byte[] addr, boolean flag) {
    halfFor(addr).set(pos(addr[0], addr[1], addr[2], addr[3]), flag);
}