如何通过同一个套接字连接发送图像和文本
How do I send images and text over the same socket connection
我有一个(非常丑陋的)方法可以从网站获取页面和页面上的所有图像。抓取网页完全没有问题。但是当我获取图像时,它们看起来很奇怪,而且绝对不像发送时那样。
我一直用于测试的 uri 是这样的:http://www.themountaingoats.net/contact.html 这个网页非常简单,并且拥有我测试所需的一切。
使用 \r 或 \n 作为结束符会产生不同的结果,\r\n 甚至无法打开图像。
public static String GET(String uri, int port) throws IOException {
String domain = uri.split("/",2)[0];
String filename = uri.split("/",2)[1];
Socket socket = new Socket(domain, port);
// send the command to the server.
System.out.println(socket.isConnected());
DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(request);
outToServer.writeBytes(request);
//create a file to write in.
File file = new File(domain+".txt");
// if file doesnt exists, then create it
if (!file.exists()) {
file.createNewFile();
}
PrintWriter writer = new PrintWriter(file);
writer.print("");
writer.close();
int characterCounter=100;
while(characterCounter >= 0){
String serverSentence = inFromServer.readLine();
System.out.println(serverSentence);
if (serverSentence.startsWith("Content-Length:")){
characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
}
if ( !serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
&& !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
&& !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") ){
characterCounter = characterCounter - serverSentence.length()-1;
}
//write in the file
FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(serverSentence+"\r\n");
bw.close();
}
Document doc = Jsoup.parse(file, "UTF-8");
Elements imgs = doc.getElementsByTag("img");
System.out.println(imgs);
for (Element link : imgs) {
String source = link.attr("src");
source = source.replace("http://"+domain+"", "");
System.out.println(source);
//create a file to write in.
File image = new File(source.replace("/", "."));
// if file doesnt exists, then create it
if (!image.exists()) {
image.createNewFile();
}
PrintWriter imageWriter = new PrintWriter(image);
imageWriter.print("");
imageWriter.close();
String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(requestImage);
outToServer.writeBytes(requestImage);
boolean flag = false;
String previousServerSentence = "something not empty";
characterCounter=100;
while(characterCounter > 0){
String serverSentence = inFromServer.readLine();
System.out.println(serverSentence);
if (serverSentence.startsWith("Content-Length:")){
characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
}
if (!flag){
if ( previousServerSentence.matches("") && !serverSentence.matches("")){
flag = true;
}
}
if ( (!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
&& !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
&& !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") && !serverSentence.startsWith("ETag: ") && !serverSentence.startsWith("Accept-Ranges: ")
&& !serverSentence.startsWith("Accept-Language: ") && !serverSentence.startsWith("Accept-Datetime: ") && !serverSentence.startsWith("Authorization: ")
&& !serverSentence.startsWith("Connection: ") && !serverSentence.startsWith("Content-Language: ") && !serverSentence.startsWith("Content-Length: ")
&& !serverSentence.startsWith("Content-Location: ") && !serverSentence.startsWith("Content-MD5: ") && !serverSentence.startsWith("Content-Range: ")
&& !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("expect: ")
&& !serverSentence.startsWith("From: ") && !serverSentence.startsWith("Host: ") && !serverSentence.startsWith("If-Match: ") && !serverSentence.startsWith("If-Modified-Since: ")
&& !serverSentence.startsWith("Accept: ") && !serverSentence.startsWith("Accept-Charset: ") && !serverSentence.startsWith("Accept-Encoding: ")
&& !serverSentence.startsWith("Age: ") && !serverSentence.startsWith("Allow: ") && !serverSentence.startsWith("Content-Encoding: ")
&& !serverSentence.startsWith("If-None-Match: ") && !serverSentence.startsWith("If-Range: ") && !serverSentence.startsWith("If-Unmodified-Since: ")
&& !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Location: ") && !serverSentence.startsWith("Max-Forwards: ")
&& !serverSentence.startsWith("Pragma: ") && !serverSentence.startsWith("Proxy-Authenticate: ") && !serverSentence.startsWith("Proxy-Authorization: ")
&& !serverSentence.startsWith("Range: ") && !serverSentence.startsWith("Referer: ") && !serverSentence.startsWith("Retry-After: ")
&& !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("TE: ") && !serverSentence.startsWith("Trailer: ")
&& !serverSentence.startsWith("Transfer-Encoding: ") && !serverSentence.startsWith("Upgrade: ") && !serverSentence.startsWith("User-Agent: ")
&& !serverSentence.startsWith("Via: ") && !serverSentence.startsWith("Warning: ") && !serverSentence.startsWith("WWW-Authenticate: "))
&& flag){
characterCounter = characterCounter - serverSentence.length()-1;
//write in the file
FileWriter fw = new FileWriter(image.getAbsoluteFile(),true);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(serverSentence+"\r");
bw.close();
}
previousServerSentence = serverSentence;
}
}
return null;
}
第一张图片以\r为底线,第二张图片以\n为底线,最后一张为原图。我完全不知道为什么图像会变得如此糟糕。
所以我的问题是:为什么会发生这种情况,我该如何解决?
编辑:
public static String GET(String uri, int port) throws IOException {
/*
* Retrieval of the webpage
*/
String domain = uri.split("/",2)[0];
String filename = uri.split("/",2)[1];
Socket socket = new Socket(domain, port);
// send the command to the server.
System.out.println(socket.isConnected());
DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(request);
outToServer.writeBytes(request);
//create a file to write in.
File file = new File(domain+".txt");
// if file doesnt exists, then create it
if (!file.exists()) {
file.createNewFile();
}
PrintWriter writer = new PrintWriter(file);
writer.print("");
writer.close();
int characterCounter=100;
while(characterCounter >= 0){
String serverSentence = inFromServer.readLine();
System.out.println(serverSentence);
if (serverSentence.startsWith("Content-Length:")){
characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
}
if ( !serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
&& !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
&& !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") ){
characterCounter = characterCounter - serverSentence.length()-1;
}
//write in the file
FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(serverSentence+"\r\n");
bw.close();
}
/*
* Retrieval of all the embedded images on the webpage that are on the same domain.
*/
Document doc = Jsoup.parse(file, "UTF-8");
Elements imgs = doc.getElementsByTag("img");
System.out.println(imgs);
for (Element link : imgs) {
String source = link.attr("src");
source = source.replace("http://"+domain+"", "");
System.out.println(source);
//create a file to write in.
File image = new File(source.replace("/", "."));
// if file doesnt exists, then create it
if (!image.exists()) {
image.createNewFile();
}
// Initialize the streams.
final FileOutputStream fileOutputStream = new FileOutputStream(image);
final InputStream inputStream = socket.getInputStream();
// Header end flag.
boolean headerEnded = false;
String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(requestImage);
outToServer.writeBytes(requestImage);
int buffersize = 1000000;
byte[] bytes = new byte[buffersize];
int length;
while ((length = inputStream.read(bytes)) != -1) {
// If the end of the header had already been reached, write the bytes to the file as normal.
if (headerEnded){
fileOutputStream.write(bytes, 0, length);
}
// This locates the end of the header by comparing the current byte as well as the next 3 bytes
// with the HTTP header end "\r\n\r\n" (which in integer representation would be 13 10 13 10).
// If the end of the header is reached, the flag is set to true and the remaining data in the
// currently buffered byte array is written into the file.
else {
for (int i = 0; i < buffersize-3; i++) {
if (bytes[i] == 13 && bytes[i + 1] == 10 && bytes[i + 2] == 13 && bytes[i + 3] == 10) {
headerEnded = true;
fileOutputStream.write(bytes, i+4 , buffersize-i-4);
break;
}
}
}
}
inputStream.close();
fileOutputStream.close();
}
socket.close();
return null;
}
这是我现在的结果:
我可以得到部分图片,但不能得到整张图片。使用 buffersize 让我走得更远甚至更远。
EDIT2:我发现了错误。它只是与某些维度有关。
最终工作代码:
public static String GET(String uri, int port) throws IOException {
/*
* Retrieval of the webpage
*/
String domain = uri.split("/",2)[0];
String filename = uri.split("/",2)[1];
Socket socket = new Socket(domain, port);
// send the command to the server.
System.out.println(socket.isConnected());
DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(request);
outToServer.writeBytes(request);
//create a file to write in.
File file = new File(domain+".txt");
// if file doesnt exists, then create it
if (!file.exists()) {
file.createNewFile();
}
PrintWriter writer = new PrintWriter(file);
writer.print("");
writer.close();
int characterCounter=100;
while(characterCounter >= 0){
String serverSentence = inFromServer.readLine();
System.out.println(serverSentence);
if (serverSentence.startsWith("Content-Length:")){
characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
}
if ( !serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
&& !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
&& !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") ){
characterCounter = characterCounter - serverSentence.length()-1;
}
//write in the file
FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(serverSentence+"\r\n");
bw.close();
}
/*
* Retrieval of all the embedded images on the webpage that are on the same domain.
*/
Document doc = Jsoup.parse(file, "UTF-8");
Elements imgs = doc.getElementsByTag("img");
System.out.println(imgs);
for (Element link : imgs) {
// Getting the link ready for GET query.
String source = link.attr("src");
source = source.replace("http://"+domain+"", "");
System.out.println(source);
//create a file to write in.
File image = new File(source.replace("/", "."));
// if file doesnt exists, then create it
if (!image.exists()) {
image.createNewFile();
}
String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(requestImage);
outToServer.writeBytes(requestImage);
// Initialize the streams.
final FileOutputStream fileOutputStream = new FileOutputStream(image);
final InputStream inputStream = socket.getInputStream();
// Header end flag.
boolean headerEnded = false;
int buffersize = 10000;
byte[] bytes = new byte[buffersize];
int length;
while ((length = inputStream.read(bytes)) != -1) {
// If the end of the header had already been reached, write the bytes to the file as normal.
if (headerEnded){
fileOutputStream.write(bytes, 0, length);
}
// This locates the end of the header by comparing the current byte as well as the next 3 bytes
// with the HTTP header end "\r\n\r\n" (which in integer representation would be 13 10 13 10).
// If the end of the header is reached, the flag is set to true and the remaining data in the
// currently buffered byte array is written into the file.
else {
for (int i = 0; i < length-3; i++) {
if (bytes[i] == 13 && bytes[i + 1] == 10 && bytes[i + 2] == 13 && bytes[i + 3] == 10) {
headerEnded = true;
fileOutputStream.write(bytes, i+4 , length-i-4);
break;
}
}
}
}
inputStream.close();
fileOutputStream.close();
}
socket.close();
return null;
}
尽可能避免使用原始套接字处理 http 请求。
如果您可以使用单独的连接来检索图像文件,请参阅 4ndrew 的回答:
如果您受困于原始套接字,请避免使用 java.io.BufferedReader。 BufferedReader 不应用于读取二进制数据。您正在将 binary 数据转换为 String 并将文本文件写入本地电脑。
请参阅 Alexay 的解决方法:
我有一个(非常丑陋的)方法可以从网站获取页面和页面上的所有图像。抓取网页完全没有问题。但是当我获取图像时,它们看起来很奇怪,而且绝对不像发送时那样。 我一直用于测试的 uri 是这样的:http://www.themountaingoats.net/contact.html 这个网页非常简单,并且拥有我测试所需的一切。
使用 \r 或 \n 作为结束符会产生不同的结果,\r\n 甚至无法打开图像。
public static String GET(String uri, int port) throws IOException {
String domain = uri.split("/",2)[0];
String filename = uri.split("/",2)[1];
Socket socket = new Socket(domain, port);
// send the command to the server.
System.out.println(socket.isConnected());
DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(request);
outToServer.writeBytes(request);
//create a file to write in.
File file = new File(domain+".txt");
// if file doesnt exists, then create it
if (!file.exists()) {
file.createNewFile();
}
PrintWriter writer = new PrintWriter(file);
writer.print("");
writer.close();
int characterCounter=100;
while(characterCounter >= 0){
String serverSentence = inFromServer.readLine();
System.out.println(serverSentence);
if (serverSentence.startsWith("Content-Length:")){
characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
}
if ( !serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
&& !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
&& !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") ){
characterCounter = characterCounter - serverSentence.length()-1;
}
//write in the file
FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(serverSentence+"\r\n");
bw.close();
}
Document doc = Jsoup.parse(file, "UTF-8");
Elements imgs = doc.getElementsByTag("img");
System.out.println(imgs);
for (Element link : imgs) {
String source = link.attr("src");
source = source.replace("http://"+domain+"", "");
System.out.println(source);
//create a file to write in.
File image = new File(source.replace("/", "."));
// if file doesnt exists, then create it
if (!image.exists()) {
image.createNewFile();
}
PrintWriter imageWriter = new PrintWriter(image);
imageWriter.print("");
imageWriter.close();
String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(requestImage);
outToServer.writeBytes(requestImage);
boolean flag = false;
String previousServerSentence = "something not empty";
characterCounter=100;
while(characterCounter > 0){
String serverSentence = inFromServer.readLine();
System.out.println(serverSentence);
if (serverSentence.startsWith("Content-Length:")){
characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
}
if (!flag){
if ( previousServerSentence.matches("") && !serverSentence.matches("")){
flag = true;
}
}
if ( (!serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
&& !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
&& !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") && !serverSentence.startsWith("ETag: ") && !serverSentence.startsWith("Accept-Ranges: ")
&& !serverSentence.startsWith("Accept-Language: ") && !serverSentence.startsWith("Accept-Datetime: ") && !serverSentence.startsWith("Authorization: ")
&& !serverSentence.startsWith("Connection: ") && !serverSentence.startsWith("Content-Language: ") && !serverSentence.startsWith("Content-Length: ")
&& !serverSentence.startsWith("Content-Location: ") && !serverSentence.startsWith("Content-MD5: ") && !serverSentence.startsWith("Content-Range: ")
&& !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("expect: ")
&& !serverSentence.startsWith("From: ") && !serverSentence.startsWith("Host: ") && !serverSentence.startsWith("If-Match: ") && !serverSentence.startsWith("If-Modified-Since: ")
&& !serverSentence.startsWith("Accept: ") && !serverSentence.startsWith("Accept-Charset: ") && !serverSentence.startsWith("Accept-Encoding: ")
&& !serverSentence.startsWith("Age: ") && !serverSentence.startsWith("Allow: ") && !serverSentence.startsWith("Content-Encoding: ")
&& !serverSentence.startsWith("If-None-Match: ") && !serverSentence.startsWith("If-Range: ") && !serverSentence.startsWith("If-Unmodified-Since: ")
&& !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Location: ") && !serverSentence.startsWith("Max-Forwards: ")
&& !serverSentence.startsWith("Pragma: ") && !serverSentence.startsWith("Proxy-Authenticate: ") && !serverSentence.startsWith("Proxy-Authorization: ")
&& !serverSentence.startsWith("Range: ") && !serverSentence.startsWith("Referer: ") && !serverSentence.startsWith("Retry-After: ")
&& !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("TE: ") && !serverSentence.startsWith("Trailer: ")
&& !serverSentence.startsWith("Transfer-Encoding: ") && !serverSentence.startsWith("Upgrade: ") && !serverSentence.startsWith("User-Agent: ")
&& !serverSentence.startsWith("Via: ") && !serverSentence.startsWith("Warning: ") && !serverSentence.startsWith("WWW-Authenticate: "))
&& flag){
characterCounter = characterCounter - serverSentence.length()-1;
//write in the file
FileWriter fw = new FileWriter(image.getAbsoluteFile(),true);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(serverSentence+"\r");
bw.close();
}
previousServerSentence = serverSentence;
}
}
return null;
}
第一张图片以\r为底线,第二张图片以\n为底线,最后一张为原图。我完全不知道为什么图像会变得如此糟糕。
所以我的问题是:为什么会发生这种情况,我该如何解决?
编辑:
public static String GET(String uri, int port) throws IOException {
/*
* Retrieval of the webpage
*/
String domain = uri.split("/",2)[0];
String filename = uri.split("/",2)[1];
Socket socket = new Socket(domain, port);
// send the command to the server.
System.out.println(socket.isConnected());
DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(request);
outToServer.writeBytes(request);
//create a file to write in.
File file = new File(domain+".txt");
// if file doesnt exists, then create it
if (!file.exists()) {
file.createNewFile();
}
PrintWriter writer = new PrintWriter(file);
writer.print("");
writer.close();
int characterCounter=100;
while(characterCounter >= 0){
String serverSentence = inFromServer.readLine();
System.out.println(serverSentence);
if (serverSentence.startsWith("Content-Length:")){
characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
}
if ( !serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
&& !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
&& !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") ){
characterCounter = characterCounter - serverSentence.length()-1;
}
//write in the file
FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(serverSentence+"\r\n");
bw.close();
}
/*
* Retrieval of all the embedded images on the webpage that are on the same domain.
*/
Document doc = Jsoup.parse(file, "UTF-8");
Elements imgs = doc.getElementsByTag("img");
System.out.println(imgs);
for (Element link : imgs) {
String source = link.attr("src");
source = source.replace("http://"+domain+"", "");
System.out.println(source);
//create a file to write in.
File image = new File(source.replace("/", "."));
// if file doesnt exists, then create it
if (!image.exists()) {
image.createNewFile();
}
// Initialize the streams.
final FileOutputStream fileOutputStream = new FileOutputStream(image);
final InputStream inputStream = socket.getInputStream();
// Header end flag.
boolean headerEnded = false;
String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(requestImage);
outToServer.writeBytes(requestImage);
int buffersize = 1000000;
byte[] bytes = new byte[buffersize];
int length;
while ((length = inputStream.read(bytes)) != -1) {
// If the end of the header had already been reached, write the bytes to the file as normal.
if (headerEnded){
fileOutputStream.write(bytes, 0, length);
}
// This locates the end of the header by comparing the current byte as well as the next 3 bytes
// with the HTTP header end "\r\n\r\n" (which in integer representation would be 13 10 13 10).
// If the end of the header is reached, the flag is set to true and the remaining data in the
// currently buffered byte array is written into the file.
else {
for (int i = 0; i < buffersize-3; i++) {
if (bytes[i] == 13 && bytes[i + 1] == 10 && bytes[i + 2] == 13 && bytes[i + 3] == 10) {
headerEnded = true;
fileOutputStream.write(bytes, i+4 , buffersize-i-4);
break;
}
}
}
}
inputStream.close();
fileOutputStream.close();
}
socket.close();
return null;
}
这是我现在的结果:
我可以得到部分图片,但不能得到整张图片。使用 buffersize 让我走得更远甚至更远。
EDIT2:我发现了错误。它只是与某些维度有关。 最终工作代码:
public static String GET(String uri, int port) throws IOException {
/*
* Retrieval of the webpage
*/
String domain = uri.split("/",2)[0];
String filename = uri.split("/",2)[1];
Socket socket = new Socket(domain, port);
// send the command to the server.
System.out.println(socket.isConnected());
DataOutputStream outToServer = new DataOutputStream(socket.getOutputStream());
BufferedReader inFromServer = new BufferedReader(new InputStreamReader(socket.getInputStream()));
String request = "GET " +"/"+ filename + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(request);
outToServer.writeBytes(request);
//create a file to write in.
File file = new File(domain+".txt");
// if file doesnt exists, then create it
if (!file.exists()) {
file.createNewFile();
}
PrintWriter writer = new PrintWriter(file);
writer.print("");
writer.close();
int characterCounter=100;
while(characterCounter >= 0){
String serverSentence = inFromServer.readLine();
System.out.println(serverSentence);
if (serverSentence.startsWith("Content-Length:")){
characterCounter = Integer.parseInt(serverSentence.replace("Content-Length: ",""));
}
if ( !serverSentence.startsWith("Cache-Control: ") && !serverSentence.startsWith("Content-Type: ") && !serverSentence.startsWith("Date: ") && !serverSentence.startsWith("Etag: ")
&& !serverSentence.startsWith("Expires: ") && !serverSentence.startsWith("Last-Modified: ") && !serverSentence.startsWith("Server: ") && !serverSentence.startsWith("Vary: ")
&& !serverSentence.startsWith("X-Cache: ") && !serverSentence.startsWith("Content-Length: ") ){
characterCounter = characterCounter - serverSentence.length()-1;
}
//write in the file
FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(serverSentence+"\r\n");
bw.close();
}
/*
* Retrieval of all the embedded images on the webpage that are on the same domain.
*/
Document doc = Jsoup.parse(file, "UTF-8");
Elements imgs = doc.getElementsByTag("img");
System.out.println(imgs);
for (Element link : imgs) {
// Getting the link ready for GET query.
String source = link.attr("src");
source = source.replace("http://"+domain+"", "");
System.out.println(source);
//create a file to write in.
File image = new File(source.replace("/", "."));
// if file doesnt exists, then create it
if (!image.exists()) {
image.createNewFile();
}
String requestImage = "GET "+ source + " HTTP/1.1 "+"\r\n"+"Host: " + domain + "\r\n\r\n";
System.out.println(requestImage);
outToServer.writeBytes(requestImage);
// Initialize the streams.
final FileOutputStream fileOutputStream = new FileOutputStream(image);
final InputStream inputStream = socket.getInputStream();
// Header end flag.
boolean headerEnded = false;
int buffersize = 10000;
byte[] bytes = new byte[buffersize];
int length;
while ((length = inputStream.read(bytes)) != -1) {
// If the end of the header had already been reached, write the bytes to the file as normal.
if (headerEnded){
fileOutputStream.write(bytes, 0, length);
}
// This locates the end of the header by comparing the current byte as well as the next 3 bytes
// with the HTTP header end "\r\n\r\n" (which in integer representation would be 13 10 13 10).
// If the end of the header is reached, the flag is set to true and the remaining data in the
// currently buffered byte array is written into the file.
else {
for (int i = 0; i < length-3; i++) {
if (bytes[i] == 13 && bytes[i + 1] == 10 && bytes[i + 2] == 13 && bytes[i + 3] == 10) {
headerEnded = true;
fileOutputStream.write(bytes, i+4 , length-i-4);
break;
}
}
}
}
inputStream.close();
fileOutputStream.close();
}
socket.close();
return null;
}
尽可能避免使用原始套接字处理 http 请求。
如果您可以使用单独的连接来检索图像文件,请参阅 4ndrew 的回答:
如果您受困于原始套接字,请避免使用 java.io.BufferedReader。 BufferedReader 不应用于读取二进制数据。您正在将 binary 数据转换为 String 并将文本文件写入本地电脑。
请参阅 Alexay 的解决方法: