无法覆盖 Python 中的 CSV 文件

Cannot overwrite CSV file in Python

我已经在这里搜索过答案,也在 Google 上花了很长时间,但没有找到解决方案。

我已经尝试使用 'w' 而不是 'r' 或 'a' 打开文件,但我仍然无法让我的代码覆盖我最初写入 CSV 文件的结果。我基本上是从网站上抓取信息:我想先搜索一个术语并抓取该数据,将其保存到 CSV 文件中;然后搜索另一个术语,抓取该数据,并用新数据覆盖当前的 CSV 文件。

#!/usr/bin/python3
#from pyvirtualdisplay import Display

import csv
from bs4 import BeautifulSoup
import urllib.request


def getPageSource(current_page):
    """Fetch *current_page* over HTTP and return its parsed BeautifulSoup tree.

    Sends browser-like headers so the site does not reject the scraper.

    :param current_page: absolute URL to download.
    :return: ``BeautifulSoup`` object parsed with the ``html5lib`` parser.
    :raises urllib.error.URLError: on network failure or HTTP errors.
    """
    hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
    req = urllib.request.Request(current_page, headers=hdr)
    # Bug fix: the original never closed the HTTP response, leaking the
    # connection. BeautifulSoup reads the stream eagerly, so it is safe to
    # close it as soon as parsing is done.
    with urllib.request.urlopen(req) as page:
        soup = BeautifulSoup(page, "html5lib")
    return soup


def get_length(file_path):
    """Return the number of CSV rows currently stored in *file_path*.

    Bug fix: the original ignored the *file_path* argument and always read
    the hard-coded ``"data.csv"``, so it only worked by accident when the
    caller happened to pass that exact name.

    :param file_path: path of the CSV file to count rows in.
    :return: number of rows (including any header row) in the file.
    :raises FileNotFoundError: if the file does not exist (unchanged from
        the original behaviour).
    """
    # newline='' is the documented way to open files for the csv module.
    with open(file_path, 'r', encoding='utf8', newline='') as csvfile:
        return sum(1 for _ in csv.reader(csvfile))


def write_data(file_path, company_name, role, full_url, date):
    """Append one job row to the CSV file at *file_path*.

    Bug fixes vs. the original:

    * The file was opened with mode ``"w"``, which truncates on every call,
      so only the last scraped job ever survived — this is the overwrite
      bug the surrounding question describes. Append mode (``"a"``) keeps
      the earlier rows.
    * ``csvfile.close()`` inside the ``with`` block was redundant (the
      context manager already closes the file) and has been removed.
    * ``newline=''`` is now passed, as the csv docs require, to avoid
      blank lines on Windows.
    * A missing file no longer crashes the first call; it simply starts
      the ID sequence at 0.

    :param file_path: CSV file to append to (created if absent).
    :param company_name: scraped company name.
    :param role: scraped job title.
    :param full_url: absolute URL of the job posting.
    :param date: posting's "updated" date string.
    """
    fieldnames = ['ID', 'Company','Role', 'URL', 'Date']
    # Next ID = number of rows already present (0 for a brand-new file).
    # Counted inline so this function is self-contained.
    try:
        with open(file_path, 'r', encoding='utf8', newline='') as existing:
            next_id = sum(1 for _ in csv.reader(existing))
    except FileNotFoundError:
        next_id = 0
    with open(file_path, "a", encoding='utf8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow({
            "ID": next_id,
            "Company": company_name,
            "Role": role,
            "URL": full_url,
            "Date": date,
        })



def find_data(source):
    """Extract every job posting from *source* and persist each one via
    ``write_data()``.

    :param source: BeautifulSoup tree of an irishjobs.ie results page.
    """
    base_url = 'https://www.irishjobs.ie'

    postings = source.find_all(attrs={"itemtype" : "https://schema.org/JobPosting"})
    for posting in postings:
        link = posting.find('h2').find('a')
        company = posting.find('h3').find('a').get_text()
        full_url = base_url + link['href']
        role = link.get_text()
        date = posting.find('li', class_='updated-time').get_text().replace('Updated', '').strip()
        write_data("data.csv", company, role, full_url, date)


if __name__ == '__main__':
    # Build the search-results URL for the user's query, download it, and
    # scrape every posting it contains into data.csv.
    query = input('Enter role to search: ')
    search_url = ('https://www.irishjobs.ie/ShowResults.aspx?Keywords=' + query
                  + '&Location=102&Category=3&Recruiter=All&SortBy=MostRecent&PerPage=100')
    find_data(getPageSource(search_url))

在完成写入之前,您需要保持文件打开。此外,记录写入的行数(使用 enumerate())比继续尝试读回文件更容易:

import csv
from bs4 import BeautifulSoup
import urllib.request


def getPageSource(current_page):
    """Download *current_page* with browser-like headers and return the
    parsed BeautifulSoup tree (``html5lib`` parser).

    :param current_page: absolute URL to fetch.
    :return: ``BeautifulSoup`` parse tree of the response body.
    :raises urllib.error.URLError: on network failure or HTTP errors.
    """
    hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
    req = urllib.request.Request(current_page, headers=hdr)
    # Bug fix: the original leaked the HTTP response object; a with-block
    # closes it deterministically once BeautifulSoup has consumed it.
    with urllib.request.urlopen(req) as page:
        return BeautifulSoup(page, "html5lib")


def find_data(source):
    """Write every job posting found in *source* to ``data.csv``.

    The file is opened once for the whole scrape (mode ``"w"`` truncates
    any previous run's output), a header row is written first, and each
    posting gets a sequential ID starting at 1.

    :param source: BeautifulSoup tree of an irishjobs.ie results page.
    """
    base_url = 'https://www.irishjobs.ie'
    header = ['ID', 'Company', 'Role', 'URL', 'Date']

    with open('data.csv', 'w', encoding='utf8', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(header)

        postings = source.find_all(attrs={"itemtype" : "https://schema.org/JobPosting"})
        for row_id, posting in enumerate(postings, start=1):
            link = posting.find('h2').find('a')
            company = posting.find('h3').find('a').get_text()
            full_url = base_url + link['href']
            role = link.get_text()
            date = posting.find('li', class_='updated-time').get_text().replace('Updated', '').strip()
            writer.writerow([row_id, company, role, full_url, date])


if __name__ == '__main__':
    # Ask for a search term, fetch the matching results page, and dump
    # every posting on it to data.csv.
    query = input('Enter role to search: ')
    search_url = ('https://www.irishjobs.ie/ShowResults.aspx?Keywords=' + query
                  + '&Location=102&Category=3&Recruiter=All&SortBy=MostRecent&PerPage=100')
    find_data(getPageSource(search_url))

这会给你 data.csv 开始:

ID,Company,Role,URL,Date
1,Computer Futures,Xamarin Developer,https://www.irishjobs.ie/Jobs/Xamarin-Developer-8143810.aspx,06/03/2018
2,Wallace Myers International,New Business Development Manager,https://www.irishjobs.ie/Jobs/New-Business-Development-Manager-8143989.aspx,06/03/2018
3,Reperio Human Capital Ltd,Senior Software Developer - Dublin,https://www.irishjobs.ie/Jobs/Senior-Software-Developer-Dublin-8150128.aspx,20/03/2018

在你的情况下,使用普通的 csv.writer() 而不是 Dictwriter() 可能更容易。