使用 pandas 从 wunderground 抓取天气数据
Using pandas to scrape weather data from wundergound
我在 Shane Lynn 上发现了一组非常有用的脚本
Analysis of Weather data。第一个脚本,用于从 Weather Underground 中抓取数据,如下所示:
import requests
import pandas as pd
from dateutil import parser, rrule
from datetime import datetime, time, date
import time
def getRainfallData(station, day, month, year):
"""
Function to return a data frame of minute-level weather data for a single Wunderground PWS station.
Args:
station (string): Station code from the Wunderground website
day (int): Day of month for which data is requested
month (int): Month for which data is requested
year (int): Year for which data is requested
Returns:
Pandas Dataframe with weather data for specified station and date.
"""
url = "http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={station}&day={day}&month={month}&year={year}&graphspan=day&format=1"
full_url = url.format(station=station, day=day, month=month, year=year)
# Request data from wunderground data
response = requests.get(full_url, headers={'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})
data = response.text
# remove the excess <br> from the text data
data = data.replace('<br>', '')
# Convert to pandas dataframe (fails if issues with weather station)
try:
dataframe = pd.read_csv(io.StringIO(data), index_col=False)
dataframe['station'] = station
except Exception as e:
print("Issue with date: {}-{}-{} for station {}".format(day,month,year, station))
return None
return dataframe
# Generate a list of all of the dates we want data for
start_date = "2016-08-01"
end_date = "2016-08-31"
start = parser.parse(start_date)
end = parser.parse(end_date)
dates = list(rrule.rrule(rrule.DAILY, dtstart=start, until=end))
# Create a list of stations here to download data for
stations = ["ILONDON28"]
# Set a backoff time in seconds if a request fails
backoff_time = 10
data = {}
# Gather data for each station in turn and save to CSV.
for station in stations:
print("Working on {}".format(station))
data[station] = []
for date in dates:
# Print period status update messages
if date.day % 10 == 0:
print("Working on date: {} for station {}".format(date, station))
done = False
while done == False:
try:
weather_data = getRainfallData(station, date.day, date.month, date.year)
done = True
except ConnectionError as e:
# May get rate limited by Wunderground.com, backoff if so.
print("Got connection error on {}".format(date))
print("Will retry in {} seconds".format(backoff_time))
time.sleep(10)
# Add each processed date to the overall data
data[station].append(weather_data)
# Finally combine all of the individual days and output to CSV for analysis.
pd.concat(data[station]).to_csv("data/{}_weather.csv".format(station))
但是,我收到错误消息:
Working on ILONDONL28
Issue with date: 1-8-2016 for station ILONDONL28
Issue with date: 2-8-2016 for station ILONDONL28
Issue with date: 3-8-2016 for station ILONDONL28
Issue with date: 4-8-2016 for station ILONDONL28
Issue with date: 5-8-2016 for station ILONDONL28
Issue with date: 6-8-2016 for station ILONDONL28
谁能帮我解决这个错误?
所选站点和时间段的数据可用,如图link。
您得到的输出是因为出现异常。如果您添加了 print e
,您会发现这是因为脚本顶部缺少 import io
。其次,你给的站名少了一个字。尝试以下操作:
import io
import requests
import pandas as pd
from dateutil import parser, rrule
from datetime import datetime, time, date
import time
def getRainfallData(station, day, month, year):
"""
Function to return a data frame of minute-level weather data for a single Wunderground PWS station.
Args:
station (string): Station code from the Wunderground website
day (int): Day of month for which data is requested
month (int): Month for which data is requested
year (int): Year for which data is requested
Returns:
Pandas Dataframe with weather data for specified station and date.
"""
url = "http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={station}&day={day}&month={month}&year={year}&graphspan=day&format=1"
full_url = url.format(station=station, day=day, month=month, year=year)
# Request data from wunderground data
response = requests.get(full_url)
data = response.text
# remove the excess <br> from the text data
data = data.replace('<br>', '')
# Convert to pandas dataframe (fails if issues with weather station)
try:
dataframe = pd.read_csv(io.StringIO(data), index_col=False)
dataframe['station'] = station
except Exception as e:
print("Issue with date: {}-{}-{} for station {}".format(day,month,year, station))
return None
return dataframe
# Generate a list of all of the dates we want data for
start_date = "2016-08-01"
end_date = "2016-08-31"
start = parser.parse(start_date)
end = parser.parse(end_date)
dates = list(rrule.rrule(rrule.DAILY, dtstart=start, until=end))
# Create a list of stations here to download data for
stations = ["ILONDONL28"]
# Set a backoff time in seconds if a request fails
backoff_time = 10
data = {}
# Gather data for each station in turn and save to CSV.
for station in stations:
print("Working on {}".format(station))
data[station] = []
for date in dates:
# Print period status update messages
if date.day % 10 == 0:
print("Working on date: {} for station {}".format(date, station))
done = False
while done == False:
try:
weather_data = getRainfallData(station, date.day, date.month, date.year)
done = True
except ConnectionError as e:
# May get rate limited by Wunderground.com, backoff if so.
print("Got connection error on {}".format(date))
print("Will retry in {} seconds".format(backoff_time))
time.sleep(10)
# Add each processed date to the overall data
data[station].append(weather_data)
# Finally combine all of the individual days and output to CSV for analysis.
pd.concat(data[station]).to_csv(r"data/{}_weather.csv".format(station))
为您提供一个 CSV 输出文件,开头如下:
,Time,TemperatureC,DewpointC,PressurehPa,WindDirection,WindDirectionDegrees,WindSpeedKMH,WindSpeedGustKMH,Humidity,HourlyPrecipMM,Conditions,Clouds,dailyrainMM,SoftwareType,DateUTC,station
0,2016-08-01 00:05:00,17.8,11.6,1017.5,ESE,120,0.0,0.0,67,0.0,,,0.0,WeatherCatV2.31B93,2016-07-31 23:05:00,ILONDONL28
1,2016-08-01 00:20:00,17.7,11.0,1017.5,SE,141,0.0,0.0,65,0.0,,,0.0,WeatherCatV2.31B93,2016-07-31 23:20:00,ILONDONL28
2,2016-08-01 00:35:00,17.5,10.8,1017.5,South,174,0.0,0.0,65,0.0,,,0.0,WeatherCatV2.31B93,2016-07-31 23:35:00,ILONDONL28
如果您没有获得 CSV 文件,我建议您添加输出文件名的完整路径。
我在 Shane Lynn 上发现了一组非常有用的脚本 Analysis of Weather data。第一个脚本,用于从 Weather Underground 中抓取数据,如下所示:
import requests
import pandas as pd
from dateutil import parser, rrule
from datetime import datetime, time, date
import time
def getRainfallData(station, day, month, year):
"""
Function to return a data frame of minute-level weather data for a single Wunderground PWS station.
Args:
station (string): Station code from the Wunderground website
day (int): Day of month for which data is requested
month (int): Month for which data is requested
year (int): Year for which data is requested
Returns:
Pandas Dataframe with weather data for specified station and date.
"""
url = "http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={station}&day={day}&month={month}&year={year}&graphspan=day&format=1"
full_url = url.format(station=station, day=day, month=month, year=year)
# Request data from wunderground data
response = requests.get(full_url, headers={'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})
data = response.text
# remove the excess <br> from the text data
data = data.replace('<br>', '')
# Convert to pandas dataframe (fails if issues with weather station)
try:
dataframe = pd.read_csv(io.StringIO(data), index_col=False)
dataframe['station'] = station
except Exception as e:
print("Issue with date: {}-{}-{} for station {}".format(day,month,year, station))
return None
return dataframe
# Generate a list of all of the dates we want data for
start_date = "2016-08-01"
end_date = "2016-08-31"
start = parser.parse(start_date)
end = parser.parse(end_date)
dates = list(rrule.rrule(rrule.DAILY, dtstart=start, until=end))
# Create a list of stations here to download data for
stations = ["ILONDON28"]
# Set a backoff time in seconds if a request fails
backoff_time = 10
data = {}
# Gather data for each station in turn and save to CSV.
for station in stations:
print("Working on {}".format(station))
data[station] = []
for date in dates:
# Print period status update messages
if date.day % 10 == 0:
print("Working on date: {} for station {}".format(date, station))
done = False
while done == False:
try:
weather_data = getRainfallData(station, date.day, date.month, date.year)
done = True
except ConnectionError as e:
# May get rate limited by Wunderground.com, backoff if so.
print("Got connection error on {}".format(date))
print("Will retry in {} seconds".format(backoff_time))
time.sleep(10)
# Add each processed date to the overall data
data[station].append(weather_data)
# Finally combine all of the individual days and output to CSV for analysis.
pd.concat(data[station]).to_csv("data/{}_weather.csv".format(station))
但是,我收到错误消息:
Working on ILONDONL28
Issue with date: 1-8-2016 for station ILONDONL28
Issue with date: 2-8-2016 for station ILONDONL28
Issue with date: 3-8-2016 for station ILONDONL28
Issue with date: 4-8-2016 for station ILONDONL28
Issue with date: 5-8-2016 for station ILONDONL28
Issue with date: 6-8-2016 for station ILONDONL28
谁能帮我解决这个错误?
所选站点和时间段的数据可用,如图link。
您得到的输出是因为出现异常。如果您添加了 print e
,您会发现这是因为脚本顶部缺少 import io
。其次,你给的站名少了一个字。尝试以下操作:
import io
import requests
import pandas as pd
from dateutil import parser, rrule
from datetime import datetime, time, date
import time
def getRainfallData(station, day, month, year):
"""
Function to return a data frame of minute-level weather data for a single Wunderground PWS station.
Args:
station (string): Station code from the Wunderground website
day (int): Day of month for which data is requested
month (int): Month for which data is requested
year (int): Year for which data is requested
Returns:
Pandas Dataframe with weather data for specified station and date.
"""
url = "http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={station}&day={day}&month={month}&year={year}&graphspan=day&format=1"
full_url = url.format(station=station, day=day, month=month, year=year)
# Request data from wunderground data
response = requests.get(full_url)
data = response.text
# remove the excess <br> from the text data
data = data.replace('<br>', '')
# Convert to pandas dataframe (fails if issues with weather station)
try:
dataframe = pd.read_csv(io.StringIO(data), index_col=False)
dataframe['station'] = station
except Exception as e:
print("Issue with date: {}-{}-{} for station {}".format(day,month,year, station))
return None
return dataframe
# Generate a list of all of the dates we want data for
start_date = "2016-08-01"
end_date = "2016-08-31"
start = parser.parse(start_date)
end = parser.parse(end_date)
dates = list(rrule.rrule(rrule.DAILY, dtstart=start, until=end))
# Create a list of stations here to download data for
stations = ["ILONDONL28"]
# Set a backoff time in seconds if a request fails
backoff_time = 10
data = {}
# Gather data for each station in turn and save to CSV.
for station in stations:
print("Working on {}".format(station))
data[station] = []
for date in dates:
# Print period status update messages
if date.day % 10 == 0:
print("Working on date: {} for station {}".format(date, station))
done = False
while done == False:
try:
weather_data = getRainfallData(station, date.day, date.month, date.year)
done = True
except ConnectionError as e:
# May get rate limited by Wunderground.com, backoff if so.
print("Got connection error on {}".format(date))
print("Will retry in {} seconds".format(backoff_time))
time.sleep(10)
# Add each processed date to the overall data
data[station].append(weather_data)
# Finally combine all of the individual days and output to CSV for analysis.
pd.concat(data[station]).to_csv(r"data/{}_weather.csv".format(station))
为您提供一个 CSV 输出文件,开头如下:
,Time,TemperatureC,DewpointC,PressurehPa,WindDirection,WindDirectionDegrees,WindSpeedKMH,WindSpeedGustKMH,Humidity,HourlyPrecipMM,Conditions,Clouds,dailyrainMM,SoftwareType,DateUTC,station
0,2016-08-01 00:05:00,17.8,11.6,1017.5,ESE,120,0.0,0.0,67,0.0,,,0.0,WeatherCatV2.31B93,2016-07-31 23:05:00,ILONDONL28
1,2016-08-01 00:20:00,17.7,11.0,1017.5,SE,141,0.0,0.0,65,0.0,,,0.0,WeatherCatV2.31B93,2016-07-31 23:20:00,ILONDONL28
2,2016-08-01 00:35:00,17.5,10.8,1017.5,South,174,0.0,0.0,65,0.0,,,0.0,WeatherCatV2.31B93,2016-07-31 23:35:00,ILONDONL28
如果您没有获得 CSV 文件,我建议您添加输出文件名的完整路径。