除了 Yahoo 的 YQL 之外,是否还有其他方法可以从其他网站提取 HTML?
Are there other options besides Yahoo's YQL for extracting HTML from other websites?
在我的应用程序中,我使用 Yahoo 的 YQL API 从其他网站提取 HTML,但 Yahoo 已停止支持该功能,使用 Yahoo 的 YQL API 提取 HTML 将不再有效。
{
"query": {
"count": 0,
"created": "2017-06-26T12:57:49Z",
"lang": "en-US",
"meta": {
"message": "html table is no longer supported. See https://policies.yahoo.com/us/en/yahoo/terms/product-atos/yql/index.htm for YQL Terms of Use"
},
"results": null
}
}
到目前为止我是这样做的:
// On DOM ready, wire every ".data-from-url" field so that each keyup sends the
// typed URL to Yahoo's YQL endpoint and copies the page's Open Graph metadata
// (og:title, og:description, og:image) into the form fields tied to that
// input's id.
// NOTE: the API response quoted alongside this snippet reports that the YQL
// "html" table is no longer supported, so this request returns no results.
$(function () {
  $(".data-from-url").keyup(function () {
    // Per-event state is kept local to the handler: a response that arrives
    // after a later keyup on another field still fills the fields belonging to
    // the input that triggered it, and nothing leaks into the global scope.
    var fileFieldId = $(this).attr('id');
    var query = 'select * from html where url="' + $(this).val() + '" and xpath="*"';
    var apiUrl = 'https://query.yahooapis.com/v1/public/yql?q=' + encodeURIComponent(query);

    $.get(apiUrl, function (data) {
      var html = $(data).find('html');

      // Each target field is matched through a data-* attribute holding the
      // id of the URL input; a fallback is written when the tag is missing.
      $("input.post[data-title='" + fileFieldId + "']").val(html.find("meta[property='og:title']").attr('content') || 'no title found');
      $("textarea.post-description[data-description='" + fileFieldId + "']").val(html.find("meta[property='og:description']").attr('content') || 'no title found');
      $("input.post-remote-image[data-img='" + fileFieldId + "']").val(html.find("meta[property='og:image']").attr('content') || '');
    });
  });
});
Here is a jsfiddle for the call I am making:
// jsfiddle demo: clicking the button requests a page through the YQL
// "htmlstring" table and copies its Open Graph metadata into the form fields.
$(function () {
  // Earlier attempt, kept for reference:
  //query = 'select * from htmlstring where url="' + $(this).val() + '" and xpath="//a"&format=json&env=store://datatables.org/alltableswithkeys&callback=';
  var endpoint = "https://query.yahooapis.com/v1/public/yql?q=select * from htmlstring where url='http://whosebug.com/'&format=json&diagnostics=true&env=store://datatables.org/alltableswithkeys&callback=";

  // Response callback: hide the progress message, then fill the three fields.
  function fillFields(data) {
    $('p.extract').addClass('none');

    var page = $(data).find('html');
    var title = page.find("meta[property='og:title']").attr('content');
    var description = page.find("meta[property='og:description']").attr('content');
    var image = page.find("meta[property='og:image']").attr('content');

    $("input.title").val(title || 'no title found');
    $("textarea.description").val(description || 'no title found');
    $("input.image").val(image || '');
  }

  $("button.click").click(function () {
    // Flip the visibility of the progress message before the request starts.
    $('p.extract').toggle();
    $.get(endpoint, fillFields);
  });
});
/* Every input element takes the full width of its container, with inner
   padding and a gap below it. */
input {
width: 100%;
margin-bottom: 20px;
padding: 10px;
}
/* Utility class that hides an element; the script adds it to the progress
   message once the response arrives. */
.none{display:none;}
<!-- jQuery 2.1.1, loaded from the Google CDN. -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<!-- Button the script selects as "button.click" to start the request. -->
<button class="click">Click Me</button>
<br>
<!-- Progress message, hidden initially; the script toggles it on click. -->
<p class="extract" style="display:none;">Extracting html</p>
<!-- Receives the og:title value. -->
<input type="text" class="title">
<br>
<!-- Receives the og:description value. -->
<textarea name="" id="" cols="30" rows="5" class="description"></textarea>
<br>
<!-- Receives the og:image value. -->
<input type="text" class="image">
是否有其他方法可以从其他站点的 head 中提取 HTML meta 标签?
使用 YQL
提取 HTML
http://developer.yahoo.com/yql/console/?q=select%20*%20from%20htmlstring%20where%20url%3D'YOUR_ENCODED_URL_HERE'&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys
示例
http://developer.yahoo.com/yql/console/?q=select%20*%20from%20htmlstring%20where%20url%3D'http%3A%2F%2Fwhosebug.com%2F'&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys
REST 查询
https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20htmlstring%20where%20url%3D'http%3A%2F%2Fwhosebug.com%2F'&format=json&diagnostics=true&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback=
htmlstring is a part of community Open Data tables.
您或许可以使用查询选择器读取元标记?我使用 fetch 来抓取 google 文档,这些文档在 html 元标记中包含所有文档属性。然后我将 html 放入一个临时对象中,我可以在我认为合适的时候使用 queryselector 命中它。类似于:
// Fetch a Google Slides page and read its og:description <meta> tag.
var url = "https://docs.google.com/presentation/d/1blSsU5LHnrjSjb7voHXkRA_NlWo3yNjLiyttmoWfslM/edit#slide=id.gcb9a0b074_1_0";
// After the scheme is stripped, the slash-separated pieces are
// host / "presentation" / "d" / <id>, so the presentation id is at index 3.
var id = url.split("://")[1].split("/")[3];
var source = "https://docs.google.com/presentation/d/" + id + "/edit?usp=sharing";
fetch(source).then(function(response) {
  return response.text();
}).then(function(html) {
  // Load the markup into a separate document so it can be queried with
  // selectors without being inserted into the current page.
  var doc = document.implementation.createHTMLDocument("foo");
  doc.documentElement.innerHTML = html;
  // querySelector returns null when nothing matches; yield null instead of
  // throwing a TypeError on getAttribute.
  var meta = doc.querySelector("meta[property='og:description']");
  return meta ? meta.getAttribute("content") : null;
}).then(function(description) {
  // The value comes from og:description, so it is labelled as such.
  console.log("document description", description);
}).catch(function(error) {
  // Network failures and parsing errors end up here instead of becoming an
  // unhandled promise rejection.
  console.error("could not read document metadata", error);
});
在我的应用程序中,我使用 Yahoo 的 YQL API 从其他网站提取 HTML,但 Yahoo 已停止支持该功能,使用 Yahoo 的 YQL API 提取 HTML 将不再有效。
{
"query": {
"count": 0,
"created": "2017-06-26T12:57:49Z",
"lang": "en-US",
"meta": {
"message": "html table is no longer supported. See https://policies.yahoo.com/us/en/yahoo/terms/product-atos/yql/index.htm for YQL Terms of Use"
},
"results": null
}
}
到目前为止我是这样做的:
// On DOM ready, wire every ".data-from-url" field so that each keyup sends the
// typed URL to Yahoo's YQL endpoint and copies the page's Open Graph metadata
// (og:title, og:description, og:image) into the form fields tied to that
// input's id.
// NOTE: the API response quoted alongside this snippet reports that the YQL
// "html" table is no longer supported, so this request returns no results.
$(function () {
  $(".data-from-url").keyup(function () {
    // Per-event state is kept local to the handler: a response that arrives
    // after a later keyup on another field still fills the fields belonging to
    // the input that triggered it, and nothing leaks into the global scope.
    var fileFieldId = $(this).attr('id');
    var query = 'select * from html where url="' + $(this).val() + '" and xpath="*"';
    var apiUrl = 'https://query.yahooapis.com/v1/public/yql?q=' + encodeURIComponent(query);

    $.get(apiUrl, function (data) {
      var html = $(data).find('html');

      // Each target field is matched through a data-* attribute holding the
      // id of the URL input; a fallback is written when the tag is missing.
      $("input.post[data-title='" + fileFieldId + "']").val(html.find("meta[property='og:title']").attr('content') || 'no title found');
      $("textarea.post-description[data-description='" + fileFieldId + "']").val(html.find("meta[property='og:description']").attr('content') || 'no title found');
      $("input.post-remote-image[data-img='" + fileFieldId + "']").val(html.find("meta[property='og:image']").attr('content') || '');
    });
  });
});
Here is a jsfiddle for the call I am making:
// jsfiddle demo: clicking the button requests a page through the YQL
// "htmlstring" table and copies its Open Graph metadata into the form fields.
$(function () {
  // Earlier attempt, kept for reference:
  //query = 'select * from htmlstring where url="' + $(this).val() + '" and xpath="//a"&format=json&env=store://datatables.org/alltableswithkeys&callback=';
  var endpoint = "https://query.yahooapis.com/v1/public/yql?q=select * from htmlstring where url='http://whosebug.com/'&format=json&diagnostics=true&env=store://datatables.org/alltableswithkeys&callback=";

  // Response callback: hide the progress message, then fill the three fields.
  function fillFields(data) {
    $('p.extract').addClass('none');

    var page = $(data).find('html');
    var title = page.find("meta[property='og:title']").attr('content');
    var description = page.find("meta[property='og:description']").attr('content');
    var image = page.find("meta[property='og:image']").attr('content');

    $("input.title").val(title || 'no title found');
    $("textarea.description").val(description || 'no title found');
    $("input.image").val(image || '');
  }

  $("button.click").click(function () {
    // Flip the visibility of the progress message before the request starts.
    $('p.extract').toggle();
    $.get(endpoint, fillFields);
  });
});
/* Every input element takes the full width of its container, with inner
   padding and a gap below it. */
input {
width: 100%;
margin-bottom: 20px;
padding: 10px;
}
/* Utility class that hides an element; the script adds it to the progress
   message once the response arrives. */
.none{display:none;}
<!-- jQuery 2.1.1, loaded from the Google CDN. -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<!-- Button the script selects as "button.click" to start the request. -->
<button class="click">Click Me</button>
<br>
<!-- Progress message, hidden initially; the script toggles it on click. -->
<p class="extract" style="display:none;">Extracting html</p>
<!-- Receives the og:title value. -->
<input type="text" class="title">
<br>
<!-- Receives the og:description value. -->
<textarea name="" id="" cols="30" rows="5" class="description"></textarea>
<br>
<!-- Receives the og:image value. -->
<input type="text" class="image">
是否有其他方法可以从其他站点的 head 中提取 HTML meta 标签?
使用 YQL 提取 HTML:
http://developer.yahoo.com/yql/console/?q=select%20*%20from%20htmlstring%20where%20url%3D'YOUR_ENCODED_URL_HERE'&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys
示例
http://developer.yahoo.com/yql/console/?q=select%20*%20from%20htmlstring%20where%20url%3D'http%3A%2F%2Fwhosebug.com%2F'&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys
REST 查询
https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20htmlstring%20where%20url%3D'http%3A%2F%2Fwhosebug.com%2F'&format=json&diagnostics=true&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback=
htmlstring is a part of community Open Data tables.
您或许可以使用查询选择器读取元标记?我使用 fetch 来抓取 google 文档,这些文档在 html 元标记中包含所有文档属性。然后我将 html 放入一个临时对象中,我可以在我认为合适的时候使用 queryselector 命中它。类似于:
// Fetch a Google Slides page and read its og:description <meta> tag.
var url = "https://docs.google.com/presentation/d/1blSsU5LHnrjSjb7voHXkRA_NlWo3yNjLiyttmoWfslM/edit#slide=id.gcb9a0b074_1_0";
// After the scheme is stripped, the slash-separated pieces are
// host / "presentation" / "d" / <id>, so the presentation id is at index 3.
var id = url.split("://")[1].split("/")[3];
var source = "https://docs.google.com/presentation/d/" + id + "/edit?usp=sharing";
fetch(source).then(function(response) {
  return response.text();
}).then(function(html) {
  // Load the markup into a separate document so it can be queried with
  // selectors without being inserted into the current page.
  var doc = document.implementation.createHTMLDocument("foo");
  doc.documentElement.innerHTML = html;
  // querySelector returns null when nothing matches; yield null instead of
  // throwing a TypeError on getAttribute.
  var meta = doc.querySelector("meta[property='og:description']");
  return meta ? meta.getAttribute("content") : null;
}).then(function(description) {
  // The value comes from og:description, so it is labelled as such.
  console.log("document description", description);
}).catch(function(error) {
  // Network failures and parsing errors end up here instead of becoming an
  // unhandled promise rejection.
  console.error("could not read document metadata", error);
});