Pandas 从 s3 读取 excel 文件并应用一些操作并将文件写入同一位置
Pandas to read a excel file from s3 and apply some operation and write the file in same location
我正在使用 pandas 从 s3 读取一个 excel 文件,我将在其中一个列中执行一些操作并在同一位置写入新版本。基本上新版本会覆盖原来的版本
使用 csv 文件我可以使用以下代码实现但不确定 excel(.xlsx)。请有人帮忙。
key = source_path + folder_name + "/" + file_name
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucketname, Key=key)
csv_buf = StringIO()
df = pd.read_csv(obj['Body'])
df["col1"] = df["col1"] * 100
df.to_csv(csv_buf, index=False)
csv_buf.seek(0)
s3.put_object(Bucket=bucketname, Body=csv_buf.getvalue(), Key=key)
我尝试使用相同的代码使用 read_excel 和 to_excel 得到 UnsupportedOperation: 寻求错误。
key = source_path + folder_name + "/" + file_name
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucketname, Key=key)
csv_buf = StringIO()
df = pd.read_excel(obj['Body'])
df["col1"] = df["col1"] * 100
df.to_excel(csv_buf, index=False)
csv_buf.seek(0)
s3.put_object(Bucket=bucketname, Body=csv_buf.getvalue(), Key=key)
错误信息:
---------------------------------------------------------------------------
UnsupportedOperation Traceback (most recent call last)
<ipython-input-298-9363cf7d6609> in <module>
6 obj = s3.get_object(Bucket=raw_bucket, Key=key)
7 csv_buf = StringIO()
----> 8 df = pd.read_excel(obj['Body'])
9 df["patient_ID"] = df["patient_ID"] * 100
10 df.to_excel(csv_buf, index=False)
~\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
297 )
298 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 299 return func(*args, **kwargs)
300
301 return wrapper
~\anaconda3\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
334 if not isinstance(io, ExcelFile):
335 should_close = True
--> 336 io = ExcelFile(io, storage_options=storage_options, engine=engine)
337 elif engine and engine != io.engine:
338 raise ValueError(
~\anaconda3\lib\site-packages\pandas\io\excel\_base.py in __init__(self, path_or_buffer, engine, storage_options)
1069 ext = "xls"
1070 else:
-> 1071 ext = inspect_excel_format(
1072 content=path_or_buffer, storage_options=storage_options
1073 )
~\anaconda3\lib\site-packages\pandas\io\excel\_base.py in inspect_excel_format(path, content, storage_options)
951 ) as handle:
952 stream = handle.handle
--> 953 stream.seek(0)
954 buf = stream.read(PEEK_SIZE)
955 if buf is None:
UnsupportedOperation: seek
示例文件
s.no patient_id
1 100
2 200
3 300
4 400
5 500
尝试以下对我有用的解决方案。现在可以直接传文件的s3位置了
key = source_path + folder_name + "/" + file_name
path = "s3://" + bucket_name + "/" + key
if target_file[num].endswith(".xlsx"):
df = pd.read_excel(path)
df["col1"] = df["col1"] * 100
df.to_excel(path)
elif target_file[num].endswith(".csv"):
df = pd.read_csv(path)
df["col1"] = df["col1"] * 100
df.to_csv(path)
我正在使用 pandas 从 s3 读取一个 excel 文件,我将在其中一个列中执行一些操作并在同一位置写入新版本。基本上新版本会覆盖原来的版本
使用 csv 文件我可以使用以下代码实现但不确定 excel(.xlsx)。请有人帮忙。
key = source_path + folder_name + "/" + file_name
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucketname, Key=key)
csv_buf = StringIO()
df = pd.read_csv(obj['Body'])
df["col1"] = df["col1"] * 100
df.to_csv(csv_buf, index=False)
csv_buf.seek(0)
s3.put_object(Bucket=bucketname, Body=csv_buf.getvalue(), Key=key)
我尝试使用相同的代码使用 read_excel 和 to_excel 得到 UnsupportedOperation: 寻求错误。
key = source_path + folder_name + "/" + file_name
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucketname, Key=key)
csv_buf = StringIO()
df = pd.read_excel(obj['Body'])
df["col1"] = df["col1"] * 100
df.to_excel(csv_buf, index=False)
csv_buf.seek(0)
s3.put_object(Bucket=bucketname, Body=csv_buf.getvalue(), Key=key)
错误信息:
---------------------------------------------------------------------------
UnsupportedOperation Traceback (most recent call last)
<ipython-input-298-9363cf7d6609> in <module>
6 obj = s3.get_object(Bucket=raw_bucket, Key=key)
7 csv_buf = StringIO()
----> 8 df = pd.read_excel(obj['Body'])
9 df["patient_ID"] = df["patient_ID"] * 100
10 df.to_excel(csv_buf, index=False)
~\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
297 )
298 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 299 return func(*args, **kwargs)
300
301 return wrapper
~\anaconda3\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
334 if not isinstance(io, ExcelFile):
335 should_close = True
--> 336 io = ExcelFile(io, storage_options=storage_options, engine=engine)
337 elif engine and engine != io.engine:
338 raise ValueError(
~\anaconda3\lib\site-packages\pandas\io\excel\_base.py in __init__(self, path_or_buffer, engine, storage_options)
1069 ext = "xls"
1070 else:
-> 1071 ext = inspect_excel_format(
1072 content=path_or_buffer, storage_options=storage_options
1073 )
~\anaconda3\lib\site-packages\pandas\io\excel\_base.py in inspect_excel_format(path, content, storage_options)
951 ) as handle:
952 stream = handle.handle
--> 953 stream.seek(0)
954 buf = stream.read(PEEK_SIZE)
955 if buf is None:
UnsupportedOperation: seek
示例文件
s.no patient_id 1 100 2 200 3 300 4 400 5 500
尝试以下对我有用的解决方案。现在可以直接传文件的s3位置了
key = source_path + folder_name + "/" + file_name
path = "s3://" + bucket_name + "/" + key
if target_file[num].endswith(".xlsx"):
df = pd.read_excel(path)
df["col1"] = df["col1"] * 100
df.to_excel(path)
elif target_file[num].endswith(".csv"):
df = pd.read_csv(path)
df["col1"] = df["col1"] * 100
df.to_csv(path)