minIO 桶 |将字节转换为数据帧
minIO Bucket | convert Bytes to Dataframe
目标:从字节对象创建一个 pandas 数据帧。
我假设有一个标准做法,但我以前没有处理过 bytes。
我可以看到数据中一致地出现 \r\n:
\r - 回车符(转义序列)
\n - 换行符,分隔每一行/每条记录
import pandas as pd
from sdg.datasource.MinioConn import MinioConn
# Fetch the CSV object through the MinioConn helper and inspect what comes back.
minio = MinioConn().client()
response = minio.get_object('project', 'foo/bar/Citizenship.csv')
# Payload exactly as returned; decoding with 'utf-8-sig' is a possible next step.
data = response.data
payload_type = type(data)
print(data)
print(payload_type)
输出:
b'\xef\xbb\xbfCitizenship\r\nAfghan\r\nAlbanian\r\nAlgerian\r\nAmerican\r\nAndorran\r\nAngolan\r\nAnguillan\r\nArgentine\r\nArmenian\r\nAustralian\r\nAustrian\r\nAzerbaijani\r\nBahamian\r\nBahraini\r\nBangladeshi\r\nBarbadian\r\nBelarusian\r\nBelgian\r\nBelizean\r\nBeninese\r\nBermudian\r\nBhutanese\r\nBolivian\r\nBotswanan\r\nBrazilian\r\nBritish\r\nBritish Virgin Islander\r\nBruneian\r\nBulgarian\r\nBurkinan\r\nBurmese\r\nBurundian\r\nCambodian\r\nCameroonian\r\nCanadian\r\nCape Verdean\r\nCayman Islander\r\nCentral African\r\nChadian\r\nChilean\r\nChinese\r\nCitizen of Antigua and Barbuda\r\nCitizen of Bosnia and Herzegovina\r\nCitizen of Guinea-Bissau\r\nCitizen of Kiribati\r\nCitizen of Seychelles\r\nCitizen of the Dominican Republic\r\nCitizen of Vanuatu\r\nColombian\r\nComoran\r\nCongolese (Congo)\r\nCongolese (DRC)\r\nCook Islander\r\nCosta Rican\r\nCroatian\r\nCuban\r\nCymraes\r\nCymro\r\nCypriot\r\nCzech\r\nDanish\r\nDjiboutian\r\nDominican\r\nDutch\r\nEast Timorese\r\nEcuadorean\r\nEgyptian\r\nEmirati\r\nEnglish\r\nEquatorial Guinean\r\nEritrean\r\nEstonian\r\nEthiopian\r\nFaroese\r\nFijian\r\nFilipino\r\nFinnish\r\nFrench\r\nGabonese\r\nGambian\r\nGeorgian\r\nGerman\r\nGhanaian\r\nGibraltarian\r\nGreek\r\nGreenlandic\r\nGrenadian\r\nGuamanian\r\nGuatemalan\r\nGuinean\r\nGuyanese\r\nHaitian\r\nHonduran\r\nHong Konger\r\nHungarian\r\nIcelandic\r\nIndian\r\nIndonesian\r\nIranian\r\nIraqi\r\nIrish\r\nIsraeli\r\nItalian\r\nIvorian\r\nJamaican\r\nJapanese\r\nJordanian\r\nKazakh\r\nKenyan\r\nKittitian\r\nKosovan\r\nKuwaiti\r\nKyrgyz\r\nLao\r\nLatvian\r\nLebanese\r\nLiberian\r\nLibyan\r\nLiechtenstein 
citizen\r\nLithuanian\r\nLuxembourger\r\nMacanese\r\nMacedonian\r\nMalagasy\r\nMalawian\r\nMalaysian\r\nMaldivian\r\nMalian\r\nMaltese\r\nMarshallese\r\nMartiniquais\r\nMauritanian\r\nMauritian\r\nMexican\r\nMicronesian\r\nMoldovan\r\nMonegasque\r\nMongolian\r\nMontenegrin\r\nMontserratian\r\nMoroccan\r\nMosotho\r\nMozambican\r\nNamibian\r\nNauruan\r\nNepalese\r\nNew Zealander\r\nNicaraguan\r\nNigerian\r\nNigerien\r\nNiuean\r\nNorth Korean\r\nNorthern Irish\r\nNorwegian\r\nOmani\r\nPakistani\r\nPalauan\r\nPalestinian\r\nPanamanian\r\nPapua New Guinean\r\nParaguayan\r\nPeruvian\r\nPitcairn Islander\r\nPolish\r\nPortuguese\r\nPrydeinig\r\nPuerto Rican\r\nQatari\r\nRomanian\r\nRussian\r\nRwandan\r\nSalvadorean\r\nSammarinese\r\nSamoan\r\nSao Tomean\r\nSaudi Arabian\r\nScottish\r\nSenegalese\r\nSerbian\r\nSierra Leonean\r\nSingaporean\r\nSlovak\r\nSlovenian\r\nSolomon Islander\r\nSomali\r\nSouth African\r\nSouth Korean\r\nSouth Sudanese\r\nSpanish\r\nSri Lankan\r\nSt Helenian\r\nSt Lucian\r\nStateless\r\nSudanese\r\nSurinamese\r\nSwazi\r\nSwedish\r\nSwiss\r\nSyrian\r\nTaiwanese\r\nTajik\r\nTanzanian\r\nThai\r\nTogolese\r\nTongan\r\nTrinidadian\r\nTristanian\r\nTunisian\r\nTurkish\r\nTurkmen\r\nTurks and Caicos Islander\r\nTuvaluan\r\nUgandan\r\nUkrainian\r\nUruguayan\r\nUzbek\r\nVatican citizen\r\nVenezuelan\r\nVietnamese\r\nVincentian\r\nWallisian\r\nWelsh\r\nYemeni\r\nZambian\r\nZimbabwean\r\n'
<class 'bytes'>
# NOTE: this is the failing attempt. `data` is a bytes object, and read_csv
# rejects it with "OSError: Expected file path name or file-like object"
# (see the traceback that follows), so neither print produces output.
print(pd.read_csv(data))
print(type(pd.read_csv(data)))
输出:
Traceback (most recent call last):
File "sdg/industry/gri/test.py", line 11, in <module>
print(pd.read_csv(data))
File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 688, in read_csv
return _read(filepath_or_buffer, kwds)
File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 454, in _read
parser = TextFileReader(fp_or_buf, **kwds)
File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 948, in __init__
self._make_engine(self.engine)
File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 1180, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 2010, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 382, in pandas._libs.parsers.TextReader.__cinit__
File "pandas/_libs/parsers.pyx", line 687, in pandas._libs.parsers.TextReader._setup_parser_source
OSError: Expected file path name or file-like object, got <class 'bytes'> type
import pandas as pd
from sdg.datasource.MinioConn import MinioConn
from io import StringIO
def minio_download(filename: str) -> pd.DataFrame:
    """Download a CSV object from MinIO and parse it into a DataFrame.

    The object is read from the ``project`` bucket under the ``foo/bar/``
    prefix.

    Args:
        filename: Object name relative to ``foo/bar/``,
            e.g. ``'Citizenship.csv'``.

    Returns:
        The CSV content parsed by ``pandas.read_csv``.

    Raises:
        UnicodeDecodeError: If the payload is not valid UTF-8.
    """
    client = MinioConn().client()
    response = client.get_object('project', f'foo/bar/{filename}')
    try:
        raw = response.data
    finally:
        # NOTE(review): assumes get_object returns the urllib3 response that
        # the MinIO SDK documents; it must be closed and its connection
        # released, otherwise the pooled connection leaks.
        response.close()
        response.release_conn()
    # 'utf-8-sig' drops a leading BOM (b'\xef\xbb\xbf') if present and is
    # otherwise identical to 'utf-8', so the first column name stays clean.
    text = raw.decode('utf-8-sig')
    # read_csv needs a path or file-like object, hence the in-memory buffer.
    return pd.read_csv(StringIO(text))
# Load the sample file, then show the parsed frame and confirm its type.
citizenship_df = minio_download('Citizenship.csv')
print(citizenship_df)
print(type(citizenship_df))
输出:
Citizenship
0 Afghan
1 Albanian
2 Algerian
3 American
4 Andorran
.. ...
220 Wallisian
221 Welsh
222 Yemeni
223 Zambian
224 Zimbabwean
[225 rows x 1 columns]
<class 'pandas.core.frame.DataFrame'>
目标:从字节对象创建一个 pandas 数据帧。
我假设有一个标准做法,但我以前没有处理过 bytes。
我可以看到数据中一致地出现 \r\n:
\r - 回车符(转义序列)
\n - 换行符,分隔每一行/每条记录
import pandas as pd
from sdg.datasource.MinioConn import MinioConn
# Fetch the CSV object through the MinioConn helper and inspect what comes back.
minio = MinioConn().client()
response = minio.get_object('project', 'foo/bar/Citizenship.csv')
# Payload exactly as returned; decoding with 'utf-8-sig' is a possible next step.
data = response.data
payload_type = type(data)
print(data)
print(payload_type)
输出:
b'\xef\xbb\xbfCitizenship\r\nAfghan\r\nAlbanian\r\nAlgerian\r\nAmerican\r\nAndorran\r\nAngolan\r\nAnguillan\r\nArgentine\r\nArmenian\r\nAustralian\r\nAustrian\r\nAzerbaijani\r\nBahamian\r\nBahraini\r\nBangladeshi\r\nBarbadian\r\nBelarusian\r\nBelgian\r\nBelizean\r\nBeninese\r\nBermudian\r\nBhutanese\r\nBolivian\r\nBotswanan\r\nBrazilian\r\nBritish\r\nBritish Virgin Islander\r\nBruneian\r\nBulgarian\r\nBurkinan\r\nBurmese\r\nBurundian\r\nCambodian\r\nCameroonian\r\nCanadian\r\nCape Verdean\r\nCayman Islander\r\nCentral African\r\nChadian\r\nChilean\r\nChinese\r\nCitizen of Antigua and Barbuda\r\nCitizen of Bosnia and Herzegovina\r\nCitizen of Guinea-Bissau\r\nCitizen of Kiribati\r\nCitizen of Seychelles\r\nCitizen of the Dominican Republic\r\nCitizen of Vanuatu\r\nColombian\r\nComoran\r\nCongolese (Congo)\r\nCongolese (DRC)\r\nCook Islander\r\nCosta Rican\r\nCroatian\r\nCuban\r\nCymraes\r\nCymro\r\nCypriot\r\nCzech\r\nDanish\r\nDjiboutian\r\nDominican\r\nDutch\r\nEast Timorese\r\nEcuadorean\r\nEgyptian\r\nEmirati\r\nEnglish\r\nEquatorial Guinean\r\nEritrean\r\nEstonian\r\nEthiopian\r\nFaroese\r\nFijian\r\nFilipino\r\nFinnish\r\nFrench\r\nGabonese\r\nGambian\r\nGeorgian\r\nGerman\r\nGhanaian\r\nGibraltarian\r\nGreek\r\nGreenlandic\r\nGrenadian\r\nGuamanian\r\nGuatemalan\r\nGuinean\r\nGuyanese\r\nHaitian\r\nHonduran\r\nHong Konger\r\nHungarian\r\nIcelandic\r\nIndian\r\nIndonesian\r\nIranian\r\nIraqi\r\nIrish\r\nIsraeli\r\nItalian\r\nIvorian\r\nJamaican\r\nJapanese\r\nJordanian\r\nKazakh\r\nKenyan\r\nKittitian\r\nKosovan\r\nKuwaiti\r\nKyrgyz\r\nLao\r\nLatvian\r\nLebanese\r\nLiberian\r\nLibyan\r\nLiechtenstein 
citizen\r\nLithuanian\r\nLuxembourger\r\nMacanese\r\nMacedonian\r\nMalagasy\r\nMalawian\r\nMalaysian\r\nMaldivian\r\nMalian\r\nMaltese\r\nMarshallese\r\nMartiniquais\r\nMauritanian\r\nMauritian\r\nMexican\r\nMicronesian\r\nMoldovan\r\nMonegasque\r\nMongolian\r\nMontenegrin\r\nMontserratian\r\nMoroccan\r\nMosotho\r\nMozambican\r\nNamibian\r\nNauruan\r\nNepalese\r\nNew Zealander\r\nNicaraguan\r\nNigerian\r\nNigerien\r\nNiuean\r\nNorth Korean\r\nNorthern Irish\r\nNorwegian\r\nOmani\r\nPakistani\r\nPalauan\r\nPalestinian\r\nPanamanian\r\nPapua New Guinean\r\nParaguayan\r\nPeruvian\r\nPitcairn Islander\r\nPolish\r\nPortuguese\r\nPrydeinig\r\nPuerto Rican\r\nQatari\r\nRomanian\r\nRussian\r\nRwandan\r\nSalvadorean\r\nSammarinese\r\nSamoan\r\nSao Tomean\r\nSaudi Arabian\r\nScottish\r\nSenegalese\r\nSerbian\r\nSierra Leonean\r\nSingaporean\r\nSlovak\r\nSlovenian\r\nSolomon Islander\r\nSomali\r\nSouth African\r\nSouth Korean\r\nSouth Sudanese\r\nSpanish\r\nSri Lankan\r\nSt Helenian\r\nSt Lucian\r\nStateless\r\nSudanese\r\nSurinamese\r\nSwazi\r\nSwedish\r\nSwiss\r\nSyrian\r\nTaiwanese\r\nTajik\r\nTanzanian\r\nThai\r\nTogolese\r\nTongan\r\nTrinidadian\r\nTristanian\r\nTunisian\r\nTurkish\r\nTurkmen\r\nTurks and Caicos Islander\r\nTuvaluan\r\nUgandan\r\nUkrainian\r\nUruguayan\r\nUzbek\r\nVatican citizen\r\nVenezuelan\r\nVietnamese\r\nVincentian\r\nWallisian\r\nWelsh\r\nYemeni\r\nZambian\r\nZimbabwean\r\n'
<class 'bytes'>
# NOTE: this is the failing attempt. `data` is a bytes object, and read_csv
# rejects it with "OSError: Expected file path name or file-like object"
# (see the traceback that follows), so neither print produces output.
print(pd.read_csv(data))
print(type(pd.read_csv(data)))
输出:
Traceback (most recent call last):
File "sdg/industry/gri/test.py", line 11, in <module>
print(pd.read_csv(data))
File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 688, in read_csv
return _read(filepath_or_buffer, kwds)
File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 454, in _read
parser = TextFileReader(fp_or_buf, **kwds)
File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 948, in __init__
self._make_engine(self.engine)
File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 1180, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 2010, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 382, in pandas._libs.parsers.TextReader.__cinit__
File "pandas/_libs/parsers.pyx", line 687, in pandas._libs.parsers.TextReader._setup_parser_source
OSError: Expected file path name or file-like object, got <class 'bytes'> type
import pandas as pd
from sdg.datasource.MinioConn import MinioConn
from io import StringIO
def minio_download(filename: str) -> pd.DataFrame:
    """Download a CSV object from MinIO and parse it into a DataFrame.

    The object is read from the ``project`` bucket under the ``foo/bar/``
    prefix.

    Args:
        filename: Object name relative to ``foo/bar/``,
            e.g. ``'Citizenship.csv'``.

    Returns:
        The CSV content parsed by ``pandas.read_csv``.

    Raises:
        UnicodeDecodeError: If the payload is not valid UTF-8.
    """
    client = MinioConn().client()
    response = client.get_object('project', f'foo/bar/{filename}')
    try:
        raw = response.data
    finally:
        # NOTE(review): assumes get_object returns the urllib3 response that
        # the MinIO SDK documents; it must be closed and its connection
        # released, otherwise the pooled connection leaks.
        response.close()
        response.release_conn()
    # 'utf-8-sig' drops a leading BOM (b'\xef\xbb\xbf') if present and is
    # otherwise identical to 'utf-8', so the first column name stays clean.
    text = raw.decode('utf-8-sig')
    # read_csv needs a path or file-like object, hence the in-memory buffer.
    return pd.read_csv(StringIO(text))
# Load the sample file, then show the parsed frame and confirm its type.
citizenship_df = minio_download('Citizenship.csv')
print(citizenship_df)
print(type(citizenship_df))
输出:
Citizenship
0 Afghan
1 Albanian
2 Algerian
3 American
4 Andorran
.. ...
220 Wallisian
221 Welsh
222 Yemeni
223 Zambian
224 Zimbabwean
[225 rows x 1 columns]
<class 'pandas.core.frame.DataFrame'>