无法使用 Android 的意图(Intent)从互联网上抓取数据
Unable to scrape data from Internet using Android intents
我无法使用 Android 中的 intents 从网页中抓取标题。目前,我只想从 URL 中提取标题文本(h1 标签文本)。我在 onHandleIntent(Intent intent) 中写了一段用于提取标题文本的代码,但我认为我做错了什么。我的基本目的是在 Android 上写一个网络爬虫。有人能帮我吗?这是我的代码:
MainActivity.java class
package com.example.awais.check2;
import android.app.Activity;
import android.content.BroadcastReceiver;
import android.content.Context;
import android.content.Intent;
import android.content.IntentFilter;
import android.os.Bundle;
import android.view.View;
import android.widget.Button;
/**
 * Screen with a single "send request" button.
 *
 * <p>Pressing the button starts {@link MyWebRequest} with a fixed URL. The result is
 * delivered back as a broadcast whose action is
 * {@link MyWebRequestReceiver#PROCESS_RESPONSE} and is printed by
 * {@link MyWebRequestReceiver}.
 */
public class MainActivity extends Activity {
    /** Receiver registered in {@link #onCreate} and unregistered in {@link #onDestroy}. */
    private MyWebRequestReceiver receiver;

    /**
     * Sets the layout, registers the response receiver and wires up the button.
     *
     * @param savedInstanceState passed through to {@code super.onCreate}
     */
    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        // The filter must carry the same action and category that MyWebRequest
        // puts on its broadcast, otherwise the response is never delivered here.
        IntentFilter filter = new IntentFilter(MyWebRequestReceiver.PROCESS_RESPONSE);
        filter.addCategory(Intent.CATEGORY_DEFAULT);
        receiver = new MyWebRequestReceiver();
        registerReceiver(receiver, filter);

        Button addButton = (Button) findViewById(R.id.sendRequest);
        addButton.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                System.out.println("Clicked");
                // Hand the URL to the service through the REQUEST_STRING extra.
                Intent msgIntent = new Intent(MainActivity.this, MyWebRequest.class);
                msgIntent.putExtra(MyWebRequest.REQUEST_STRING, "http://tribune.com.pk");
                startService(msgIntent);
            }
        });
    }

    /** Unregisters the receiver that {@link #onCreate} registered. */
    @Override
    public void onDestroy() {
        this.unregisterReceiver(receiver);
        super.onDestroy();
    }

    /**
     * Prints the two string extras carried by the response broadcast.
     *
     * <p>Declared {@code static} because it uses no state of the enclosing activity;
     * a non-static inner class would keep a hidden reference to it.
     */
    public static class MyWebRequestReceiver extends BroadcastReceiver {
        /** Action string shared by the broadcast sender and this receiver's filter. */
        public static final String PROCESS_RESPONSE = "com.example.check.intent.action.PROCESS_RESPONSE";

        /**
         * Prints the response message, then the response string, to standard output.
         *
         * @param context unused
         * @param intent  broadcast carrying {@code RESPONSE_STRING} and {@code RESPONSE_MESSAGE}
         */
        @Override
        public void onReceive(Context context, Intent intent) {
            String responseString = intent.getStringExtra(MyWebRequest.RESPONSE_STRING);
            String responseMessage = intent.getStringExtra(MyWebRequest.RESPONSE_MESSAGE);
            System.out.println(responseMessage);
            System.out.println(responseString);
        }
    }
}
MyWebRequest.java 类的代码
package com.example.awais.check2;
import android.app.IntentService;
import android.content.Intent;
import android.os.SystemClock;
import android.text.format.DateFormat;
import android.util.Log;
import com.example.awais.check2.MainActivity.MyWebRequestReceiver;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Service that downloads the page named in the {@link #REQUEST_STRING} extra with
 * jsoup, collects the text of its {@code h1} elements and broadcasts the result.
 *
 * <p>The broadcast uses the action {@code MyWebRequestReceiver.PROCESS_RESPONSE} and
 * carries two string extras: {@link #RESPONSE_STRING} (the requested URL followed by
 * a timestamp) and {@link #RESPONSE_MESSAGE} (the heading text, or a description of
 * the failure).
 */
public class MyWebRequest extends IntentService {
    public static final String REQUEST_STRING = "myRequest";
    public static final String RESPONSE_STRING = "myResponse";
    public static final String RESPONSE_MESSAGE = "myResponseMessage";

    /** Connect/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20 * 1000;

    public MyWebRequest() {
        super("MyWebRequest");
    }

    /**
     * Fetches the requested page and broadcasts the text of its {@code h1} elements.
     *
     * @param intent the start request; its {@link #REQUEST_STRING} extra is the URL to fetch
     */
    @Override
    protected void onHandleIntent(Intent intent) {
        // Without a request there is nothing to fetch and nothing to answer.
        if (intent == null) {
            return;
        }

        String requestString = intent.getStringExtra(REQUEST_STRING);
        String responseString = requestString + " "
                + DateFormat.format("MM/dd/yy h:mmaa", System.currentTimeMillis());

        String responseMessage;
        try {
            Document doc = Jsoup.connect(requestString).timeout(TIMEOUT_MILLIS).get();
            Elements headings = doc.select("h1");
            responseMessage = headings.text();
        } catch (IOException e) {
            // Network failure, HTTP error status or unsupported content type.
            responseMessage = describeFailure(e);
        } catch (IllegalArgumentException e) {
            // jsoup rejects a missing or malformed URL with an unchecked exception;
            // report it to the caller instead of crashing the worker thread.
            responseMessage = describeFailure(e);
        }

        Intent broadcastIntent = new Intent();
        broadcastIntent.setAction(MyWebRequestReceiver.PROCESS_RESPONSE);
        broadcastIntent.addCategory(Intent.CATEGORY_DEFAULT);
        broadcastIntent.putExtra(RESPONSE_STRING, responseString);
        broadcastIntent.putExtra(RESPONSE_MESSAGE, responseMessage);
        sendBroadcast(broadcastIntent);
    }

    /** Returns the exception message, or the exception's string form when it has none. */
    private static String describeFailure(Exception e) {
        String message = e.getMessage();
        return message != null ? message : e.toString();
    }
}
Content.xml代码
<?xml version="1.0" encoding="utf-8"?>
<!-- Vertical layout holding the single button that MainActivity looks up as R.id.sendRequest. -->
<!-- match_parent replaces the deprecated, equivalent fill_parent. -->
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
    android:orientation="vertical"
    android:layout_width="match_parent"
    android:layout_height="match_parent"
    android:paddingLeft="10dp">
    <Button
        android:id="@+id/sendRequest"
        android:text="Send Request "
        android:layout_width="wrap_content"
        android:layout_height="wrap_content" />
</LinearLayout>
您是否考虑过为 Android 使用 http 框架?它的代码少了很多,并且还在后台运行请求。此示例使用 loopj async client
build.gradle:
compile 'com.loopj.android:android-async-http:1.4.9'
compile 'cz.msebera.android:httpclient:4.4.1.2'
测试代码:
@Test
public void parseHttp() throws Exception {
AsyncHttpClient client = new AsyncHttpClient();
final CountDownLatch latch = new CountDownLatch(1);
String url = "";
client.get(url, new AsyncHttpResponseHandler(Looper.getMainLooper()) {
@Override
public void onSuccess(int statusCode, Header[] headers, byte[] responseBody) {
String body = new String(responseBody);
Pattern p = Pattern.compile("<h1(.*)<\/h1>");
Matcher m = p.matcher(body);
Log.d("tag", "success");
if ( m.find() ) {
String match = m.group(1);
Log.d("tag", match);
}
latch.countDown();
}
@Override
public void onFailure(int statusCode, Header[] headers, byte[] responseBody, Throwable error) {
Log.d("tag", "failure");
latch.countDown();
}
});
latch.await();
}
更新:
在上面的 build.gradle 中添加了额外的依赖:compile 'cz.msebera.android:httpclient:4.4.1.2',测试代码需要以下导入:
import cz.msebera.android.httpclient.Header;
import cz.msebera.android.httpclient.HttpRequest;
import cz.msebera.android.httpclient.HttpResponse;
import cz.msebera.android.httpclient.HttpStatus;
我无法使用 Android 中的 intents 从网页中抓取标题。目前,我只想从 URL 中提取标题文本(h1 标签文本)。我在 onHandleIntent(Intent intent) 中写了一段用于提取标题文本的代码,但我认为我做错了什么。我的基本目的是在 Android 上写一个网络爬虫。有人能帮我吗?这是我的代码:
MainActivity.java class
package com.example.awais.check2;
import android.app.Activity;
import android.content.BroadcastReceiver;
import android.content.Context;
import android.content.Intent;
import android.content.IntentFilter;
import android.os.Bundle;
import android.view.View;
import android.widget.Button;
/**
 * Screen with a single "send request" button.
 *
 * <p>Pressing the button starts {@link MyWebRequest} with a fixed URL. The result is
 * delivered back as a broadcast whose action is
 * {@link MyWebRequestReceiver#PROCESS_RESPONSE} and is printed by
 * {@link MyWebRequestReceiver}.
 */
public class MainActivity extends Activity {
    /** Receiver registered in {@link #onCreate} and unregistered in {@link #onDestroy}. */
    private MyWebRequestReceiver receiver;

    /**
     * Sets the layout, registers the response receiver and wires up the button.
     *
     * @param savedInstanceState passed through to {@code super.onCreate}
     */
    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        // The filter must carry the same action and category that MyWebRequest
        // puts on its broadcast, otherwise the response is never delivered here.
        IntentFilter filter = new IntentFilter(MyWebRequestReceiver.PROCESS_RESPONSE);
        filter.addCategory(Intent.CATEGORY_DEFAULT);
        receiver = new MyWebRequestReceiver();
        registerReceiver(receiver, filter);

        Button addButton = (Button) findViewById(R.id.sendRequest);
        addButton.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                System.out.println("Clicked");
                // Hand the URL to the service through the REQUEST_STRING extra.
                Intent msgIntent = new Intent(MainActivity.this, MyWebRequest.class);
                msgIntent.putExtra(MyWebRequest.REQUEST_STRING, "http://tribune.com.pk");
                startService(msgIntent);
            }
        });
    }

    /** Unregisters the receiver that {@link #onCreate} registered. */
    @Override
    public void onDestroy() {
        this.unregisterReceiver(receiver);
        super.onDestroy();
    }

    /**
     * Prints the two string extras carried by the response broadcast.
     *
     * <p>Declared {@code static} because it uses no state of the enclosing activity;
     * a non-static inner class would keep a hidden reference to it.
     */
    public static class MyWebRequestReceiver extends BroadcastReceiver {
        /** Action string shared by the broadcast sender and this receiver's filter. */
        public static final String PROCESS_RESPONSE = "com.example.check.intent.action.PROCESS_RESPONSE";

        /**
         * Prints the response message, then the response string, to standard output.
         *
         * @param context unused
         * @param intent  broadcast carrying {@code RESPONSE_STRING} and {@code RESPONSE_MESSAGE}
         */
        @Override
        public void onReceive(Context context, Intent intent) {
            String responseString = intent.getStringExtra(MyWebRequest.RESPONSE_STRING);
            String responseMessage = intent.getStringExtra(MyWebRequest.RESPONSE_MESSAGE);
            System.out.println(responseMessage);
            System.out.println(responseString);
        }
    }
}
MyWebRequest.java 类的代码
package com.example.awais.check2;
import android.app.IntentService;
import android.content.Intent;
import android.os.SystemClock;
import android.text.format.DateFormat;
import android.util.Log;
import com.example.awais.check2.MainActivity.MyWebRequestReceiver;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Service that downloads the page named in the {@link #REQUEST_STRING} extra with
 * jsoup, collects the text of its {@code h1} elements and broadcasts the result.
 *
 * <p>The broadcast uses the action {@code MyWebRequestReceiver.PROCESS_RESPONSE} and
 * carries two string extras: {@link #RESPONSE_STRING} (the requested URL followed by
 * a timestamp) and {@link #RESPONSE_MESSAGE} (the heading text, or a description of
 * the failure).
 */
public class MyWebRequest extends IntentService {
    public static final String REQUEST_STRING = "myRequest";
    public static final String RESPONSE_STRING = "myResponse";
    public static final String RESPONSE_MESSAGE = "myResponseMessage";

    /** Connect/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20 * 1000;

    public MyWebRequest() {
        super("MyWebRequest");
    }

    /**
     * Fetches the requested page and broadcasts the text of its {@code h1} elements.
     *
     * @param intent the start request; its {@link #REQUEST_STRING} extra is the URL to fetch
     */
    @Override
    protected void onHandleIntent(Intent intent) {
        // Without a request there is nothing to fetch and nothing to answer.
        if (intent == null) {
            return;
        }

        String requestString = intent.getStringExtra(REQUEST_STRING);
        String responseString = requestString + " "
                + DateFormat.format("MM/dd/yy h:mmaa", System.currentTimeMillis());

        String responseMessage;
        try {
            Document doc = Jsoup.connect(requestString).timeout(TIMEOUT_MILLIS).get();
            Elements headings = doc.select("h1");
            responseMessage = headings.text();
        } catch (IOException e) {
            // Network failure, HTTP error status or unsupported content type.
            responseMessage = describeFailure(e);
        } catch (IllegalArgumentException e) {
            // jsoup rejects a missing or malformed URL with an unchecked exception;
            // report it to the caller instead of crashing the worker thread.
            responseMessage = describeFailure(e);
        }

        Intent broadcastIntent = new Intent();
        broadcastIntent.setAction(MyWebRequestReceiver.PROCESS_RESPONSE);
        broadcastIntent.addCategory(Intent.CATEGORY_DEFAULT);
        broadcastIntent.putExtra(RESPONSE_STRING, responseString);
        broadcastIntent.putExtra(RESPONSE_MESSAGE, responseMessage);
        sendBroadcast(broadcastIntent);
    }

    /** Returns the exception message, or the exception's string form when it has none. */
    private static String describeFailure(Exception e) {
        String message = e.getMessage();
        return message != null ? message : e.toString();
    }
}
Content.xml代码
<?xml version="1.0" encoding="utf-8"?>
<!-- Vertical layout holding the single button that MainActivity looks up as R.id.sendRequest. -->
<!-- match_parent replaces the deprecated, equivalent fill_parent. -->
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
    android:orientation="vertical"
    android:layout_width="match_parent"
    android:layout_height="match_parent"
    android:paddingLeft="10dp">
    <Button
        android:id="@+id/sendRequest"
        android:text="Send Request "
        android:layout_width="wrap_content"
        android:layout_height="wrap_content" />
</LinearLayout>
您是否考虑过为 Android 使用 http 框架?它的代码少了很多,并且还在后台运行请求。此示例使用 loopj async client
build.gradle:
compile 'com.loopj.android:android-async-http:1.4.9'
compile 'cz.msebera.android:httpclient:4.4.1.2'
测试代码:
@Test
public void parseHttp() throws Exception {
AsyncHttpClient client = new AsyncHttpClient();
final CountDownLatch latch = new CountDownLatch(1);
String url = "";
client.get(url, new AsyncHttpResponseHandler(Looper.getMainLooper()) {
@Override
public void onSuccess(int statusCode, Header[] headers, byte[] responseBody) {
String body = new String(responseBody);
Pattern p = Pattern.compile("<h1(.*)<\/h1>");
Matcher m = p.matcher(body);
Log.d("tag", "success");
if ( m.find() ) {
String match = m.group(1);
Log.d("tag", match);
}
latch.countDown();
}
@Override
public void onFailure(int statusCode, Header[] headers, byte[] responseBody, Throwable error) {
Log.d("tag", "failure");
latch.countDown();
}
});
latch.await();
}
更新:
在上面的 build.gradle 中添加了额外的依赖:compile 'cz.msebera.android:httpclient:4.4.1.2',测试代码需要以下导入:
import cz.msebera.android.httpclient.Header;
import cz.msebera.android.httpclient.HttpRequest;
import cz.msebera.android.httpclient.HttpResponse;
import cz.msebera.android.httpclient.HttpStatus;