CSV 文件未在 Pandas 中正确输入数据框
CSV file not properly entering dataframe in Pandas
我正在尝试将从 Yahoo Finance 下载的 CSV 文件作为数据帧导入 pandas。美国股票有效,例如股票代码 SPY 或 FB,但在加拿大交易所上市的股票则无效,例如 SU 或 CRE。来自 Yahoo 的 CSV 文件在文本编辑器中打开时显示完全相同:
Date,Open,High,Low,Close,Adj Close,Volume
每个格式都完全相同(例如,每个日期都是 YYYY-MM-DD。)但我收到此错误:"ValueError: can only convert an array of size 1 to a Python scalar" 任何时候我尝试使用加拿大自动收报机。这是我的代码:
import os
import pandas as pd
import matplotlib.pyplot as plt
def symbol_to_path(symbol, base_dir="data"):
"""Return CSV file path given ticker symbol."""
return os.path.join(base_dir, "{}.csv".format(str(symbol)))
def get_data(symbols, dates):
"""Read stock data (adjusted close) for given symbols from CSV files."""
df = pd.DataFrame(index=dates)
if 'SPY' not in symbols: # add SPY for reference, if absent
symbols.insert(0, 'SPY')
for symbol in symbols:
df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
parse_dates=True,
usecols=['Date', 'Adj Close'],
na_values=['nan'])
df_temp = df_temp.rename(columns={'Adj Close': symbol})
print df_temp
df = df.join(df_temp)
if symbol == 'SPY': # drop dates SPY did not trade
df = df.dropna(subset=["SPY"])
return df
def plot_data(df, title="Stock prices", xlabel="Date", ylabel="Price"):
"""Plot stock prices with a custom title and meaningful axis labels."""
ax = df.plot(title=title, fontsize=12)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
plt.show()
def compute_daily_returns(df):
"""Compute and return the daily return values."""
# TODO: Your code here
# Note: Returned DataFrame must have the same number of rows
# daily_returns = df.copy() # copy DataFrame to match size and column names
# compute daily returns for row 1 onwards
# daily_returns[1:] = (df[1:] / df[:-1].values) - 1
# daily_returns.ix[0, :] = 0 # set daily returns for row 0 to 0
# Another way to compute daily returns using .shift
daily_returns = (df / df.shift(1)) - 1
return daily_returns
def compute_cumulative_returns(df):
cumulative_return = df.copy()
cumulative_return = (df.iloc[:] / df.iloc[0]) - 1
return cumulative_return
def normalize_data(df):
# Normalize stock prices by dividing by the first row of the dataframe
return df / df.iloc[0,:]
def test_run():
# Read data
dates = pd.date_range('2017-01-01', '2017-12-31')
# SPY, FB both work perfectly, SU breaks
symbols = ['SPY', 'FB', 'SU']
df = get_data(symbols, dates)
plot_data(df)
plot_data(normalize_data(df), title="Normalized return", ylabel="Normalized return")
# Compute daily returns
daily_returns = compute_daily_returns(df)
plot_data(daily_returns, title="Daily returns", ylabel="Daily returns")
# Compute cumulative returns
cumulative_returns = compute_cumulative_returns(df)
plot_data(cumulative_returns, title="Cumulative returns", ylabel = "Cumulative return", xlabel = "Time")
if __name__ == "__main__":
test_run()
此代码假定在 运行 目录中,有一个名为 'data'(不带引号)的文件夹,其中包含与 'data' 中提供的代码同名的 CSV 文件=31=] test_run() 中的数组。从这里为 SU 下载的数据:
来自 SPY:
我知道 SPY 在 SU 之前开始交易,但 FB 也比 SPY 开始交易晚得多,并且使用提供的代码。
SU.TO.csv 文件包含具有 'null' 个值的行。
例如第 68 行:
1995-04-13,2.383130,2.383130,2.359380,2.359380,0.963797,3283200
1995-04-14,null,null,null,null,null,null
1995-04-17,2.390620,2.390620,2.359380,2.359380,0.963797,600000
因此您必须在代码中处理好这一点。
也许你可以使用 na_values=['null']
而不是 na_values=['nan']
?
我正在尝试将从 Yahoo Finance 下载的 CSV 文件作为数据帧导入 pandas。美国股票有效,例如股票代码 SPY 或 FB,但在加拿大交易所上市的股票则无效,例如 SU 或 CRE。来自 Yahoo 的 CSV 文件在文本编辑器中打开时显示完全相同:
Date,Open,High,Low,Close,Adj Close,Volume
每个格式都完全相同(例如,每个日期都是 YYYY-MM-DD。)但我收到此错误:"ValueError: can only convert an array of size 1 to a Python scalar" 任何时候我尝试使用加拿大自动收报机。这是我的代码:
import os
import pandas as pd
import matplotlib.pyplot as plt
def symbol_to_path(symbol, base_dir="data"):
"""Return CSV file path given ticker symbol."""
return os.path.join(base_dir, "{}.csv".format(str(symbol)))
def get_data(symbols, dates):
"""Read stock data (adjusted close) for given symbols from CSV files."""
df = pd.DataFrame(index=dates)
if 'SPY' not in symbols: # add SPY for reference, if absent
symbols.insert(0, 'SPY')
for symbol in symbols:
df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
parse_dates=True,
usecols=['Date', 'Adj Close'],
na_values=['nan'])
df_temp = df_temp.rename(columns={'Adj Close': symbol})
print df_temp
df = df.join(df_temp)
if symbol == 'SPY': # drop dates SPY did not trade
df = df.dropna(subset=["SPY"])
return df
def plot_data(df, title="Stock prices", xlabel="Date", ylabel="Price"):
"""Plot stock prices with a custom title and meaningful axis labels."""
ax = df.plot(title=title, fontsize=12)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
plt.show()
def compute_daily_returns(df):
"""Compute and return the daily return values."""
# TODO: Your code here
# Note: Returned DataFrame must have the same number of rows
# daily_returns = df.copy() # copy DataFrame to match size and column names
# compute daily returns for row 1 onwards
# daily_returns[1:] = (df[1:] / df[:-1].values) - 1
# daily_returns.ix[0, :] = 0 # set daily returns for row 0 to 0
# Another way to compute daily returns using .shift
daily_returns = (df / df.shift(1)) - 1
return daily_returns
def compute_cumulative_returns(df):
cumulative_return = df.copy()
cumulative_return = (df.iloc[:] / df.iloc[0]) - 1
return cumulative_return
def normalize_data(df):
# Normalize stock prices by dividing by the first row of the dataframe
return df / df.iloc[0,:]
def test_run():
# Read data
dates = pd.date_range('2017-01-01', '2017-12-31')
# SPY, FB both work perfectly, SU breaks
symbols = ['SPY', 'FB', 'SU']
df = get_data(symbols, dates)
plot_data(df)
plot_data(normalize_data(df), title="Normalized return", ylabel="Normalized return")
# Compute daily returns
daily_returns = compute_daily_returns(df)
plot_data(daily_returns, title="Daily returns", ylabel="Daily returns")
# Compute cumulative returns
cumulative_returns = compute_cumulative_returns(df)
plot_data(cumulative_returns, title="Cumulative returns", ylabel = "Cumulative return", xlabel = "Time")
if __name__ == "__main__":
test_run()
此代码假定在 运行 目录中,有一个名为 'data'(不带引号)的文件夹,其中包含与 'data' 中提供的代码同名的 CSV 文件=31=] test_run() 中的数组。从这里为 SU 下载的数据:
来自 SPY:
我知道 SPY 在 SU 之前开始交易,但 FB 也比 SPY 开始交易晚得多,并且使用提供的代码。
SU.TO.csv 文件包含具有 'null' 个值的行。
例如第 68 行:
1995-04-13,2.383130,2.383130,2.359380,2.359380,0.963797,3283200
1995-04-14,null,null,null,null,null,null
1995-04-17,2.390620,2.390620,2.359380,2.359380,0.963797,600000
因此您必须在代码中处理好这一点。
也许你可以使用 na_values=['null']
而不是 na_values=['nan']
?