Extract relevant links and store them as a .csv file

from datetime import datetime
from urllib.request import urlopen

from bs4 import BeautifulSoup


page1 = urlopen("http://en.wikipedia.org/wiki/List_of_human_stampedes")
soup = BeautifulSoup(page1, 'html.parser')

# the <ul> right after the "20th century" heading holds the event entries
events = soup.find('span', id='20th_century').parent.find_next_sibling('ul')
for event in events.find_all('li'):
    try:
        # each entry starts with "Month DD, YYYY: description"
        date_string, rest = event.text.split(':', 1)
        print(datetime.strptime(date_string, '%B %d, %Y').strftime('%d/%m/%Y'))
    except ValueError:
        print(event.text)

Using the above method I can extract the dates from the <li> tags. I also want to extract the citation links. The problem is that each <li> tag contains many links, and even though the cite elements have a class defined ("citation"), I still cannot get the complete link. I eventually want to store these as a table (in .csv format) where each row contains the date and the citation link. Referenced question -
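
To make the issue concrete, here is a minimal sketch (reusing soup from the snippet above): a plain link search inside one list item returns ordinary article links mixed in with the "#cite_note-" footnote anchors.

first_event = soup.find('span', id='20th_century').parent.find_next_sibling('ul').li
for a in first_event.find_all('a', href=True):
    # prints both "/wiki/..." article links and "#cite_note-..." footnote anchors
    print(a['href'])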

  • You can start from here. It creates a csv file with rows in the following format:

    date,link
    

    If there is an error while extracting the date component, it skips the row. For now, as an example, it works for the "20th century" section:

    import csv
    from datetime import datetime
    from urllib.parse import urljoin
    from urllib.request import urlopen
    
    from bs4 import BeautifulSoup
    
    base_url = 'http://en.wikipedia.org'
    page = urlopen("http://en.wikipedia.org/wiki/List_of_human_stampedes")
    soup = BeautifulSoup(page, 'html.parser')
    
    # map each footnote id to the external links found inside that footnote
    references = {}
    for item in soup.select('ol.references li[id]'):
        links = [a['href'] if a['href'].startswith('http') else urljoin(base_url, a['href'])
                 for a in item.select('span.reference-text a[href]')]
        references[item['id']] = links
    
    
    # walk the sibling tags that follow the "20th century" heading
    events = soup.find('span', id='20th_century').parent.find_next_siblings()
    with open('output.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        for tag in events:
            # stop at the next section heading
            if tag.name == 'h2':
                break
    
            for event in tag.find_all('li'):
                # extract the date; skip entries that don't parse
                try:
                    date_string, _ = event.text.split(':', 1)
                    date = datetime.strptime(date_string, '%B %d, %Y').strftime('%d/%m/%Y')
                except ValueError:
                    continue
    
                # extract footnote links and write one row per referenced url
                links = event.find_all('a', href=lambda x: x and x.startswith('#cite_note-'))
                if links:
                    for link in links:
                        for ref in references.get(link['href'][1:], []):
                            writer.writerow([date, ref])
                else:
                    writer.writerow([date, ''])
    

    output.csv after running the script:

    19/09/1902,
    30/12/1903,
    11/01/1908,
    24/12/1913,
    23/10/1942,http://www.ferrovieinrete.com/doc_storici/GalleriaGrazie.pdf
    09/03/1946,
    01/01/1956,
    02/01/1971,
    ...
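
    If you later need the links grouped per date, here is a minimal sketch for reading output.csv back in (assuming the file produced by the script above):

    import csv
    from collections import defaultdict
    
    links_by_date = defaultdict(list)
    with open('output.csv', newline='') as f:
        for row in csv.reader(f):
            if len(row) == 2 and row[1]:  # skip events without reference links
                links_by_date[row[0]].append(row[1])
    
    # e.g. links_by_date['23/10/1942'] ->
    # ['http://www.ferrovieinrete.com/doc_storici/GalleriaGrazie.pdf']
    for date, links in links_by_date.items():
        print(date, links)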