如何从 alexa.com 解析 div 标签并在 django 中的 table 中显示结果
How to parse div tag from alexa.com and show results in table in django
我已经使用 HTMLParser
和 urllib2
成功创建了一个网络应用程序,它从 www.alexa.com/topsites/global 获取前 20 个网站,并将结果放在 HTML table 中。我的问题是,我无法按照同样的思路为 <div class="count">
和 <div class="description">
应用相同的算法。
有人能帮我提供一些不使用 BS4 的代码片段吗?
到目前为止我的代码:
urlparse.py
import HTMLParser, urllib
class MyHTMLParser(HTMLParser.HTMLParser):
    """Collect the link text of each ``<a>`` whose href starts with ``/siteinfo/``.

    After ``feed()`` the collected link texts are available, in document
    order, in ``self.site_list``.
    """

    def reset(self):
        """Reset the parser state, including the list of collected sites."""
        HTMLParser.HTMLParser.reset(self)
        # The list is created per instance here. A class-level list would be
        # shared by every parser object and would keep growing each time a
        # new parser is fed (e.g. once per request in a view).
        self.site_list = []
        self.in_a = False
        self.next_link_text_pair = None

    def handle_starttag(self, tag, attrs):
        """Start tracking an ``<a>`` tag that carries an ``href`` attribute."""
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # [href, accumulated link text]
                    self.next_link_text_pair = [value, '']
                    self.in_a = True
                    break

    def handle_data(self, data):
        """Append text found inside the tracked ``<a>`` to its link text."""
        if self.in_a:
            self.next_link_text_pair[1] += data

    def handle_endtag(self, tag):
        """On ``</a>``, keep the link text if the href is a ``/siteinfo/`` link."""
        if tag == 'a':
            if self.next_link_text_pair is not None:
                if self.next_link_text_pair[0].startswith('/siteinfo/'):
                    self.site_list.append(self.next_link_text_pair[1])
            self.next_link_text_pair = None
            self.in_a = False
if __name__ == '__main__':
    # Script entry point: download the page, parse it and print the first
    # 20 collected entries.
    parser = MyHTMLParser()
    page = urllib.urlopen('http://www.alexa.com/topsites/global').read()
    parser.feed(page)
    # A single parenthesised argument prints the same text as the bare
    # Python 2 print statement.
    print(parser.site_list[:20])
urls.py
# URL routes: the site root is served by the view referenced as
# 'myapp.views.top_urls' (route name 'home'); paths starting with 'admin/'
# are delegated to the URLs of admin.site.
urlpatterns = patterns('',
url(r'^$', 'myapp.views.top_urls', name='home'),
url(r'^admin/', include(admin.site.urls)),
)
views.py
def top_urls(request):
    """Render ``top_urls.html`` with the first 20 parsed entries as ``urls``."""
    parser = MyHTMLParser()
    response = urllib2.urlopen('http://www.alexa.com/topsites/global')
    parser.feed(response.read())
    top_twenty = parser.site_list[:20]
    # Debug output of the parsed entries (same text as the print statement).
    print(top_twenty)
    context = {'urls': top_twenty}
    return render(request, 'top_urls.html', context)
top_urls.html
...
<tbody>
{% for url in urls %}
<tr>
<td>Something</td><!--here should be {{rank}}-->
<td>{{ url }}</td>
<td>something</td><!--here should be {{description}}-->
</tr>
{% endfor %}
</tbody>
...
这个想法是创建某种状态机。在 starttag
事件中,我们决定稍后在 data
事件中填充站点信息的哪个数据字段。根据 <div>
/<span>
class
属性和 ATTR_FIELDS
映射做出决定。
例如,如果启动了 <div class="count">
标签,那么我们将填充当前 self.site
字典的 rank
字段。
class MyHTMLParser(HTMLParser.HTMLParser):
    """State-machine parser that collects rank, url and description per site.

    Each ``<li class="site-listing">`` element produces one dict with the
    keys ``rank``, ``url`` and ``description``; the dicts are appended to
    ``self.site_list`` when the ``</li>`` is reached.
    """

    # Maps the ``class`` of a <div>/<span> to the site-info key that the
    # following text data should be appended to.
    ATTR_FIELDS = {'count': 'rank',
                   'description': 'description', 'remainder': 'description'}

    def reset_site(self):
        """Start a fresh, empty site record and leave the listing state."""
        self.site = {'rank': '', 'url': '', 'description': ''}
        self.in_site_listing = self.data_field = False

    def reset(self):
        """Reset the parser state, including the collected site list."""
        HTMLParser.HTMLParser.reset(self)
        self.reset_site()
        self.site_list = []

    def handle_starttag(self, tag, attrs):
        """Decide, from the start tag, which site field the next data fills."""
        attr_map = dict(attrs)
        class_attr = attr_map.get('class')
        if tag == 'li' and class_attr == 'site-listing':
            self.in_site_listing = True
        elif self.in_site_listing:
            if tag == 'a':
                if class_attr != 'moreDesc':
                    # An <a> without an href (or with a bare one) must not
                    # crash the parse; only a real href updates the url.
                    href = attr_map.get('href')
                    if href is not None:
                        self.site['url'] = href.replace('/siteinfo/', '')
            elif tag in ['div', 'span']:
                self.data_field = self.ATTR_FIELDS.get(class_attr)

    def handle_data(self, data):
        """Append text to the currently selected site field, if any."""
        if self.data_field:
            self.site[self.data_field] += data

    def handle_endtag(self, tag):
        """Store the finished site on ``</li>``; any end tag deselects the field."""
        if tag == 'li' and self.in_site_listing:
            self.site_list.append(self.site)
            self.reset_site()
        self.data_field = None
然后更改视图和模板:
view.py
def top_urls(request):
    """Render ``top_urls.html`` with the first 20 parsed site records as ``sites``."""
    parser = MyHTMLParser()
    response = urllib2.urlopen('http://www.alexa.com/topsites/global')
    parser.feed(response.read())
    context = {'sites': parser.site_list[:20]}
    return render(request, 'top_urls.html', context)
top_urls.html
...
<tbody>
{% for site in sites %}
<tr>
<td>{{ site.rank }}</td>
<td>{{ site.url }}</td>
<td>{{ site.description }}</td>
</tr>
{% endfor %}
</tbody>
...
说明更新:
使用的变量:
self.site
- 当前站点信息
self.in_site_listing
- 标志位;当我们处于 <li class="site-listing"> 标签内时设为 True
self.data_field
- 要向其追加数据的站点信息键(字段名)
ATTR_FIELDS
- <div>
/<span>
类 到站点信息键 的映射
关键方法是handle_starttag()
:
def handle_starttag(self, tag, attrs):
    """Decide, from the start tag, which site field the next data fills.

    ``attrs`` is the list of ``(name, value)`` pairs of the tag.
    """
    attr_map = dict(attrs)
    # get the tag `class` attribute if any
    class_attr = attr_map.get('class')
    # if the tag is `<li class="site-listing">` then set the flag that we
    # should populate the site info
    if tag == 'li' and class_attr == 'site-listing':
        self.in_site_listing = True
    # we are in the site population mode
    elif self.in_site_listing:
        if tag == 'a':
            # `<li class="site-listing">` contains two `<a>` tags. We should
            # use the tag without the `class="moreDesc"` attribute to set
            # the url
            if class_attr != 'moreDesc':
                # an `<a>` without a usable href must not crash the parse
                href = attr_map.get('href')
                if href is not None:
                    self.site['url'] = href.replace('/siteinfo/', '')
        elif tag in ['div', 'span']:
            # we are in the `<div>` or `<span>` tag. Get the `class` attribute
            # of the tag and decide which field of the site info we will
            # populate in the `handle_data()` method
            self.data_field = self.ATTR_FIELDS.get(class_attr)
所以 handle_data()
非常简单:
def handle_data(self, data):
    """Append ``data`` to the site field selected by the last start tag."""
    field = self.data_field
    # nothing to do unless we know which field of site info to populate
    if not field:
        return
    # The site description is spread over several tags, so the text is
    # accumulated rather than assigned.
    self.site[field] = self.site[field] + data
我已经使用 HTMLParser
和 urllib2
成功创建了一个网络应用程序,它从 www.alexa.com/topsites/global 获取前 20 个网站,并将结果放在 HTML table 中。我的问题是,我无法按照同样的思路为 <div class="count">
和 <div class="description">
应用相同的算法。
有人能帮我提供一些不使用 BS4 的代码片段吗?
到目前为止我的代码:
urlparse.py
import HTMLParser, urllib
class MyHTMLParser(HTMLParser.HTMLParser):
    """Collect the link text of each ``<a>`` whose href starts with ``/siteinfo/``.

    After ``feed()`` the collected link texts are available, in document
    order, in ``self.site_list``.
    """

    def reset(self):
        """Reset the parser state, including the list of collected sites."""
        HTMLParser.HTMLParser.reset(self)
        # The list is created per instance here. A class-level list would be
        # shared by every parser object and would keep growing each time a
        # new parser is fed (e.g. once per request in a view).
        self.site_list = []
        self.in_a = False
        self.next_link_text_pair = None

    def handle_starttag(self, tag, attrs):
        """Start tracking an ``<a>`` tag that carries an ``href`` attribute."""
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # [href, accumulated link text]
                    self.next_link_text_pair = [value, '']
                    self.in_a = True
                    break

    def handle_data(self, data):
        """Append text found inside the tracked ``<a>`` to its link text."""
        if self.in_a:
            self.next_link_text_pair[1] += data

    def handle_endtag(self, tag):
        """On ``</a>``, keep the link text if the href is a ``/siteinfo/`` link."""
        if tag == 'a':
            if self.next_link_text_pair is not None:
                if self.next_link_text_pair[0].startswith('/siteinfo/'):
                    self.site_list.append(self.next_link_text_pair[1])
            self.next_link_text_pair = None
            self.in_a = False
if __name__ == '__main__':
    # Script entry point: download the page, parse it and print the first
    # 20 collected entries.
    parser = MyHTMLParser()
    page = urllib.urlopen('http://www.alexa.com/topsites/global').read()
    parser.feed(page)
    # A single parenthesised argument prints the same text as the bare
    # Python 2 print statement.
    print(parser.site_list[:20])
urls.py
# URL routes: the site root is served by the view referenced as
# 'myapp.views.top_urls' (route name 'home'); paths starting with 'admin/'
# are delegated to the URLs of admin.site.
urlpatterns = patterns('',
url(r'^$', 'myapp.views.top_urls', name='home'),
url(r'^admin/', include(admin.site.urls)),
)
views.py
def top_urls(request):
    """Render ``top_urls.html`` with the first 20 parsed entries as ``urls``."""
    parser = MyHTMLParser()
    response = urllib2.urlopen('http://www.alexa.com/topsites/global')
    parser.feed(response.read())
    top_twenty = parser.site_list[:20]
    # Debug output of the parsed entries (same text as the print statement).
    print(top_twenty)
    context = {'urls': top_twenty}
    return render(request, 'top_urls.html', context)
top_urls.html
...
<tbody>
{% for url in urls %}
<tr>
<td>Something</td><!--here should be {{rank}}-->
<td>{{ url }}</td>
<td>something</td><!--here should be {{description}}-->
</tr>
{% endfor %}
</tbody>
...
这个想法是创建某种状态机。在 starttag
事件中,我们决定稍后在 data
事件中填充站点信息的哪个数据字段。根据 <div>
/<span>
class
属性和 ATTR_FIELDS
映射做出决定。
例如,如果启动了 <div class="count">
标签,那么我们将填充当前 self.site
字典的 rank
字段。
class MyHTMLParser(HTMLParser.HTMLParser):
    """State-machine parser that collects rank, url and description per site.

    Each ``<li class="site-listing">`` element produces one dict with the
    keys ``rank``, ``url`` and ``description``; the dicts are appended to
    ``self.site_list`` when the ``</li>`` is reached.
    """

    # Maps the ``class`` of a <div>/<span> to the site-info key that the
    # following text data should be appended to.
    ATTR_FIELDS = {'count': 'rank',
                   'description': 'description', 'remainder': 'description'}

    def reset_site(self):
        """Start a fresh, empty site record and leave the listing state."""
        self.site = {'rank': '', 'url': '', 'description': ''}
        self.in_site_listing = self.data_field = False

    def reset(self):
        """Reset the parser state, including the collected site list."""
        HTMLParser.HTMLParser.reset(self)
        self.reset_site()
        self.site_list = []

    def handle_starttag(self, tag, attrs):
        """Decide, from the start tag, which site field the next data fills."""
        attr_map = dict(attrs)
        class_attr = attr_map.get('class')
        if tag == 'li' and class_attr == 'site-listing':
            self.in_site_listing = True
        elif self.in_site_listing:
            if tag == 'a':
                if class_attr != 'moreDesc':
                    # An <a> without an href (or with a bare one) must not
                    # crash the parse; only a real href updates the url.
                    href = attr_map.get('href')
                    if href is not None:
                        self.site['url'] = href.replace('/siteinfo/', '')
            elif tag in ['div', 'span']:
                self.data_field = self.ATTR_FIELDS.get(class_attr)

    def handle_data(self, data):
        """Append text to the currently selected site field, if any."""
        if self.data_field:
            self.site[self.data_field] += data

    def handle_endtag(self, tag):
        """Store the finished site on ``</li>``; any end tag deselects the field."""
        if tag == 'li' and self.in_site_listing:
            self.site_list.append(self.site)
            self.reset_site()
        self.data_field = None
然后更改视图和模板:
view.py
def top_urls(request):
    """Render ``top_urls.html`` with the first 20 parsed site records as ``sites``."""
    parser = MyHTMLParser()
    response = urllib2.urlopen('http://www.alexa.com/topsites/global')
    parser.feed(response.read())
    context = {'sites': parser.site_list[:20]}
    return render(request, 'top_urls.html', context)
top_urls.html
...
<tbody>
{% for site in sites %}
<tr>
<td>{{ site.rank }}</td>
<td>{{ site.url }}</td>
<td>{{ site.description }}</td>
</tr>
{% endfor %}
</tbody>
...
说明更新:
使用的变量:
self.site
- 当前站点信息
self.in_site_listing
- 标志位;当我们处于 <li class="site-listing"> 标签内时设为 True
self.data_field
- 要向其追加数据的站点信息键(字段名)
ATTR_FIELDS
-<div>
/<span>
类 到站点信息键 的映射
关键方法是handle_starttag()
:
def handle_starttag(self, tag, attrs):
    """Decide, from the start tag, which site field the next data fills.

    ``attrs`` is the list of ``(name, value)`` pairs of the tag.
    """
    attr_map = dict(attrs)
    # get the tag `class` attribute if any
    class_attr = attr_map.get('class')
    # if the tag is `<li class="site-listing">` then set the flag that we
    # should populate the site info
    if tag == 'li' and class_attr == 'site-listing':
        self.in_site_listing = True
    # we are in the site population mode
    elif self.in_site_listing:
        if tag == 'a':
            # `<li class="site-listing">` contains two `<a>` tags. We should
            # use the tag without the `class="moreDesc"` attribute to set
            # the url
            if class_attr != 'moreDesc':
                # an `<a>` without a usable href must not crash the parse
                href = attr_map.get('href')
                if href is not None:
                    self.site['url'] = href.replace('/siteinfo/', '')
        elif tag in ['div', 'span']:
            # we are in the `<div>` or `<span>` tag. Get the `class` attribute
            # of the tag and decide which field of the site info we will
            # populate in the `handle_data()` method
            self.data_field = self.ATTR_FIELDS.get(class_attr)
所以 handle_data()
非常简单:
def handle_data(self, data):
    """Append ``data`` to the site field selected by the last start tag."""
    field = self.data_field
    # nothing to do unless we know which field of site info to populate
    if not field:
        return
    # The site description is spread over several tags, so the text is
    # accumulated rather than assigned.
    self.site[field] = self.site[field] + data