无法将 pandas 数据框存储为 csv
Unable to store pandas data frame as a csv
我关注这个 tutorial 是为了从新闻站点检索数据。
主要功能是getDailyNews。它会在每个新闻源上循环,请求 api,提取数据并将其转储到 pandas DataFrame,然后将结果导出到 csv 文件。
但是当我 运行 代码时,出现错误。
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from tqdm import tqdm, tqdm_notebook
from functools import reduce
def getSources():
source_url = 'https://newsapi.org/v1/sources?language=en'
response = requests.get(source_url).json()
sources = []
for source in response['sources']:
sources.append(source['id'])
return sources
def mapping():
d = {}
response = requests.get('https://newsapi.org/v1/sources?language=en')
response = response.json()
for s in response['sources']:
d[s['id']] = s['category']
return d
def category(source, m):
try:
return m[source]
except:
return 'NC'
def getDailyNews():
sources = getSources()
key = '96f279e1b7f845669089abc016e915cc'
url = 'https://newsapi.org/v1/articles?source={0}&sortBy={1}&apiKey={2}'
responses = []
for i, source in tqdm_notebook(enumerate(sources), total=len(sources)):
try:
u = url.format(source, 'top', key)
except:
u = url.format(source, 'latest', key)
response = requests.get(u)
r = response.json()
try:
for article in r['articles']:
article['source'] = source
responses.append(r)
except:
print('Rate limit exceeded ... please wait and retry in 6 hours')
return None
articles = list(map(lambda r: r['articles'], responses))
articles = list(reduce(lambda x,y: x+y, articles))
news = pd.DataFrame(articles)
news = news.dropna()
news = news.drop_duplicates()
news.reset_index(inplace=True, drop=True)
d = mapping()
news['category'] = news['source'].map(lambda s: category(s, d))
news['scraping_date'] = datetime.now()
try:
aux = pd.read_csv('./data/news.csv')
aux = aux.append(news)
aux = aux.drop_duplicates('url')
aux.reset_index(inplace=True, drop=True)
aux.to_csv('./data/news.csv', encoding='utf-8', index=False)
except:
news.to_csv('./data/news.csv', index=False, encoding='utf-8')
print('Done')
if __name__=='__main__':
getDailyNews()
错误:
FileNotFoundError: [Errno 2] No such file or directory: './data/news.csv'
我知道我必须在pd.read_csv中给出路径名,但我不知道我必须在这里给出哪条路径。
如果您从中执行此程序的目录中还没有 data
文件夹,则此错误是有意义的。 post .
中也有类似的问题
我关注这个 tutorial 是为了从新闻站点检索数据。
主要功能是getDailyNews。它会在每个新闻源上循环,请求 api,提取数据并将其转储到 pandas DataFrame,然后将结果导出到 csv 文件。
但是当我 运行 代码时,出现错误。
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from tqdm import tqdm, tqdm_notebook
from functools import reduce
def getSources():
source_url = 'https://newsapi.org/v1/sources?language=en'
response = requests.get(source_url).json()
sources = []
for source in response['sources']:
sources.append(source['id'])
return sources
def mapping():
d = {}
response = requests.get('https://newsapi.org/v1/sources?language=en')
response = response.json()
for s in response['sources']:
d[s['id']] = s['category']
return d
def category(source, m):
try:
return m[source]
except:
return 'NC'
def getDailyNews():
sources = getSources()
key = '96f279e1b7f845669089abc016e915cc'
url = 'https://newsapi.org/v1/articles?source={0}&sortBy={1}&apiKey={2}'
responses = []
for i, source in tqdm_notebook(enumerate(sources), total=len(sources)):
try:
u = url.format(source, 'top', key)
except:
u = url.format(source, 'latest', key)
response = requests.get(u)
r = response.json()
try:
for article in r['articles']:
article['source'] = source
responses.append(r)
except:
print('Rate limit exceeded ... please wait and retry in 6 hours')
return None
articles = list(map(lambda r: r['articles'], responses))
articles = list(reduce(lambda x,y: x+y, articles))
news = pd.DataFrame(articles)
news = news.dropna()
news = news.drop_duplicates()
news.reset_index(inplace=True, drop=True)
d = mapping()
news['category'] = news['source'].map(lambda s: category(s, d))
news['scraping_date'] = datetime.now()
try:
aux = pd.read_csv('./data/news.csv')
aux = aux.append(news)
aux = aux.drop_duplicates('url')
aux.reset_index(inplace=True, drop=True)
aux.to_csv('./data/news.csv', encoding='utf-8', index=False)
except:
news.to_csv('./data/news.csv', index=False, encoding='utf-8')
print('Done')
if __name__=='__main__':
getDailyNews()
错误:
FileNotFoundError: [Errno 2] No such file or directory: './data/news.csv'
我知道我必须在pd.read_csv中给出路径名,但我不知道我必须在这里给出哪条路径。
如果您从中执行此程序的目录中还没有 data
文件夹,则此错误是有意义的。 post