从 url 获取编码的 csv 到 Pandas
Getting encoded csv from url into Pandas
我正在尝试将此站点上的以下 csv 文件导入 Pandas。
我已经尝试了几种方法,但到目前为止还无法得到可用的 csv。最终目的是把它变成 Pandas DataFrame。
谁能帮我指出正确的方向并解释为什么下面的方法不起作用?
使用Python3.7,Windows10
import requests
import urllib
import csv

csv_url = 'https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/773656/HMRC_spending_over_25000_for_December_2018.csv'

# Attempt 1: hand the HTTP response object straight to csv.reader.
# Iterating the urlopen() response yields bytes lines, while csv.reader
# requires an iterable of str, hence the error below.
response = urllib.request.urlopen(csv_url)
cr = csv.reader(response)
for row in cr:
    print(row)
# csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
# Attempt 2: read the whole body and decode it before parsing.
# bytes.decode() with no argument uses UTF-8, and the file contains a byte
# (0x96) that is not valid UTF-8, hence the error below.
response = urllib.request.urlopen(csv_url)
response = response.read().decode()
cr = csv.reader(response)
for row in cr:
    print(row)
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 28452: invalid start byte
# Attempt 3: let requests decode the body and pass the text to csv.reader.
# csv.reader iterates over its argument; iterating a str yields one
# character at a time, so every character is parsed as its own line.
response = requests.get(csv_url).text
cr = csv.reader(response)
for row in cr:
    print(row)
# malformed, prints individual characters
如果你使用 pandas>=0.19.2,可以直接把 csv 的 url 传给 read_csv:
import pandas as pd

# Pass the URL directly to read_csv and let pandas fetch the file.
url = "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/773656/HMRC_spending_over_25000_for_December_2018.csv"
# The encoding must be given explicitly; otherwise you get a UnicodeDecodeError:
# 'utf-8' codec can't decode byte 0x96 in position 12: invalid start byte
c = pd.read_csv(url, encoding="latin1")
否则使用 io.StringIO,即:
from io import StringIO

import pandas as pd
import requests

url = "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/773656/HMRC_spending_over_25000_for_December_2018.csv"
# Download the raw bytes, decode them explicitly, then give pandas an
# in-memory text buffer to parse.
s = requests.get(url).content
decoded_text = s.decode("latin1")
c = pd.read_csv(StringIO(decoded_text))
这是一个编码问题,因为该文件似乎使用了 Windows 特定的编码(cp1252):字节 0x96 在 cp1252 中是短破折号“–”,但它不是合法的 UTF-8 起始字节,所以按 UTF-8 解码会失败。
df = pd.read_csv(url, encoding='cp1252')
应该可以。
将编码更改为 cp1252
import io

import pandas as pd
import requests

url = "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/773656/HMRC_spending_over_25000_for_December_2018.csv"
# Fetch the bytes, decode them as cp1252, and parse the resulting text
# from an in-memory buffer.
s = requests.get(url).content
csv_buffer = io.StringIO(s.decode("cp1252"))
c = pd.read_csv(csv_buffer)
我正在尝试将此站点上的以下 csv 文件导入 Pandas。
我已经尝试了几种方法,但到目前为止还无法得到可用的 csv。最终目的是把它变成 Pandas DataFrame。
谁能帮我指出正确的方向并解释为什么下面的方法不起作用?
使用Python3.7,Windows10
import requests
import urllib
import csv

csv_url = 'https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/773656/HMRC_spending_over_25000_for_December_2018.csv'

# Attempt 1: hand the HTTP response object straight to csv.reader.
# Iterating the urlopen() response yields bytes lines, while csv.reader
# requires an iterable of str, hence the error below.
response = urllib.request.urlopen(csv_url)
cr = csv.reader(response)
for row in cr:
    print(row)
# csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
# Attempt 2: read the whole body and decode it before parsing.
# bytes.decode() with no argument uses UTF-8, and the file contains a byte
# (0x96) that is not valid UTF-8, hence the error below.
response = urllib.request.urlopen(csv_url)
response = response.read().decode()
cr = csv.reader(response)
for row in cr:
    print(row)
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 28452: invalid start byte
# Attempt 3: let requests decode the body and pass the text to csv.reader.
# csv.reader iterates over its argument; iterating a str yields one
# character at a time, so every character is parsed as its own line.
response = requests.get(csv_url).text
cr = csv.reader(response)
for row in cr:
    print(row)
# malformed, prints individual characters
如果你使用 pandas>=0.19.2,可以直接把 csv 的 url 传给 read_csv:
import pandas as pd

# Pass the URL directly to read_csv and let pandas fetch the file.
url = "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/773656/HMRC_spending_over_25000_for_December_2018.csv"
# The encoding must be given explicitly; otherwise you get a UnicodeDecodeError:
# 'utf-8' codec can't decode byte 0x96 in position 12: invalid start byte
c = pd.read_csv(url, encoding="latin1")
否则使用 io.StringIO,即:
from io import StringIO

import pandas as pd
import requests

url = "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/773656/HMRC_spending_over_25000_for_December_2018.csv"
# Download the raw bytes, decode them explicitly, then give pandas an
# in-memory text buffer to parse.
s = requests.get(url).content
decoded_text = s.decode("latin1")
c = pd.read_csv(StringIO(decoded_text))
这是一个编码问题,因为该文件似乎使用了 Windows 特定的编码(cp1252):字节 0x96 在 cp1252 中是短破折号“–”,但它不是合法的 UTF-8 起始字节,所以按 UTF-8 解码会失败。
df = pd.read_csv(url, encoding='cp1252')
应该可以。
将编码更改为 cp1252
import io

import pandas as pd
import requests

url = "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/773656/HMRC_spending_over_25000_for_December_2018.csv"
# Fetch the bytes, decode them as cp1252, and parse the resulting text
# from an in-memory buffer.
s = requests.get(url).content
csv_buffer = io.StringIO(s.decode("cp1252"))
c = pd.read_csv(csv_buffer)