使用 python 从 html 中提取联系信息
extract contact information from html with python
这是一个示例html
<div class="yui3-u-5-6" id="browse-products">
<div id="kazbah-contact">
<span class="contact-info-title">Contact 00Nothing:</span>
<a href="mailto:info@00nothing.com">info@00nothing.com</a> | 800-410-2074
| C/O Score X Score
8118-D Statesville Rd
,
Charlotte,
NC
28269
</div>
<div class="clearfix"></div>
我想从这段 HTML 中提取联系信息:电子邮件、电话和地址。
我应该如何使用 python 来做到这一点?谢谢
我用这段代码提取信息
# _*_ coding:utf-8 _*_
import urllib2
import urllib
import re
from bs4 import BeautifulSoup
import sys
# Python 2 idiom: reload sys so that setdefaultencoding can be called, then
# make UTF-8 the process-wide default codec for implicit str/unicode conversion.
reload(sys)
sys.setdefaultencoding('utf-8')
def grabHref(url,localfile):
    """Crawl the brand index at ``url`` and save each brand's contact e-mail.

    Every ``div > a`` link on the index page whose ``href`` starts with
    ``http://www.karmaloop.com/kazbah/browse`` is fetched. For each
    ``div > a[href^=mailto]`` anchor found on that brand page, the brand page
    URL and the anchor text are printed and written to ``localfile`` as two
    CRLF-terminated lines.

    Only the text of the ``mailto`` anchor is captured; any other text in the
    surrounding element is not read by this function.

    Args:
        url: Address of the index page that lists the brand links.
        localfile: Path of the output file. It is opened in ``'wb'`` mode, so
            any existing content is overwritten.
    """
    html = urllib2.urlopen(url).read()
    # Decode the index page as gb2312 (dropping undecodable bytes) and hand
    # UTF-8 bytes to the parser.
    html = unicode(html,'gb2312','ignore').encode('utf-8','ignore')
    soup = BeautifulSoup(html)
    # `with` closes (and flushes) the file even if a fetch or parse raises
    # part-way through the crawl; the bare open()/close() pair did not.
    with open(localfile,'wb') as myfile:
        for link in soup.select("div > a[href^=http://www.karmaloop.com/kazbah/browse]"):
            href = link['href']
            brand_page = BeautifulSoup(urllib2.urlopen(href).read())
            for item in brand_page.select("div > a[href^=mailto]"):
                contactInfo = item.get_text()
                print(href)
                print(contactInfo)
                myfile.write(href)
                myfile.write('\r\n')
                myfile.write(contactInfo)
                myfile.write('\r\n')
def main():
    """Run the crawl with the script's fixed settings.

    Calls ``grabHref`` with the brand index URL and the relative output file
    name ``Contact.txt``.
    """
    grabHref("http://www.karmaloop.com/brands", 'Contact.txt')


# Start the crawl only when this file is executed as a script.
if __name__=="__main__":
    main()
但这样我仍然只能获取到电子邮件地址,如何才能同时获取电话号码和地址?谢谢
这是一个示例html
<div class="yui3-u-5-6" id="browse-products">
<div id="kazbah-contact">
<span class="contact-info-title">Contact 00Nothing:</span>
<a href="mailto:info@00nothing.com">info@00nothing.com</a> | 800-410-2074
| C/O Score X Score
8118-D Statesville Rd
,
Charlotte,
NC
28269
</div>
<div class="clearfix"></div>
我想从这段 HTML 中提取联系信息:电子邮件、电话和地址。 我应该如何使用 Python 来做到这一点?谢谢
我用这段代码提取信息
# _*_ coding:utf-8 _*_
import urllib2
import urllib
import re
from bs4 import BeautifulSoup
import sys
# Python 2 idiom: reload sys so that setdefaultencoding can be called, then
# make UTF-8 the process-wide default codec for implicit str/unicode conversion.
reload(sys)
sys.setdefaultencoding('utf-8')
def grabHref(url,localfile):
    """Crawl the brand index at ``url`` and save each brand's contact e-mail.

    Every ``div > a`` link on the index page whose ``href`` starts with
    ``http://www.karmaloop.com/kazbah/browse`` is fetched. For each
    ``div > a[href^=mailto]`` anchor found on that brand page, the brand page
    URL and the anchor text are printed and written to ``localfile`` as two
    CRLF-terminated lines.

    Only the text of the ``mailto`` anchor is captured; any other text in the
    surrounding element is not read by this function.

    Args:
        url: Address of the index page that lists the brand links.
        localfile: Path of the output file. It is opened in ``'wb'`` mode, so
            any existing content is overwritten.
    """
    html = urllib2.urlopen(url).read()
    # Decode the index page as gb2312 (dropping undecodable bytes) and hand
    # UTF-8 bytes to the parser.
    html = unicode(html,'gb2312','ignore').encode('utf-8','ignore')
    soup = BeautifulSoup(html)
    # `with` closes (and flushes) the file even if a fetch or parse raises
    # part-way through the crawl; the bare open()/close() pair did not.
    with open(localfile,'wb') as myfile:
        for link in soup.select("div > a[href^=http://www.karmaloop.com/kazbah/browse]"):
            href = link['href']
            brand_page = BeautifulSoup(urllib2.urlopen(href).read())
            for item in brand_page.select("div > a[href^=mailto]"):
                contactInfo = item.get_text()
                print(href)
                print(contactInfo)
                myfile.write(href)
                myfile.write('\r\n')
                myfile.write(contactInfo)
                myfile.write('\r\n')
def main():
    """Run the crawl with the script's fixed settings.

    Calls ``grabHref`` with the brand index URL and the relative output file
    name ``Contact.txt``.
    """
    grabHref("http://www.karmaloop.com/brands", 'Contact.txt')


# Start the crawl only when this file is executed as a script.
if __name__=="__main__":
    main()
但这样我仍然只能获取到电子邮件地址,如何才能同时获取电话号码和地址?谢谢