如何在 Python 中将抓取的数据保存为 JSON
How to save scraped data to JSON in Python
我想抓取一个网站,并将所需的数据放入 JSON 文件。我遇到的问题是:我得到的只是一段文本,并且只能把它打印出来。但我只需要把特定的数据写入 JSON 文件,并在我的类中重复使用这些数据。我正在抓取的网站和我的代码如下:
import requests
from bs4 import BeautifulSoup

URL = 'https://lt.brcauto.eu/automobiliu-paieska/'

req = requests.get(URL)
soup = BeautifulSoup(req.text, 'lxml')
# Second-to-last pagination item: per the author's note the last item is the
# ">" arrow, so the one before it carries the biggest page number.
pages = soup.find_all('li', class_='page-item')[-2]

cars_printed_counter = 0
for number in range(1, int(pages.text)):
    req = requests.get(URL + '?page=' + str(number))
    soup = BeautifulSoup(req.text, 'lxml')
    # Stop walking pages once 20 cars have been printed.
    if cars_printed_counter == 20:
        break
    for single_car in soup.find_all('div', class_='cars-wrapper'):
        # The limit can also be reached in the middle of a page.
        if cars_printed_counter == 20:
            break
        Car_Title = single_car.find('h2', class_='cars__title')
        Car_Specs = single_car.find('p', class_='cars__subtitle')
        print('\nCar number:', cars_printed_counter + 1)
        print(Car_Title.text)
        print(Car_Specs.text)
        cars_printed_counter += 1
我得到的数据是这样的:Printed results
Car number: 19
BMW 520 Gran Turismo M-Sport
2013 | 2.0 Diesel | Automation | 255229 km | 135 kW (184 AG) | Black
Car number: 20
BMW 750 i Automation
2005 | 5.0 Gasoline | Automation | 343906 km | 270 kW (367 AG) | Grey
问题是:我应该如何将数据放入 JSON 文件,使其看起来像这样:Desired json
[
{
"fuel": "diesel",
"title": "BMW 520 Gran Turismo M-Sport",
"year": 2013,
"run": 255229,
"type": "Black"
},
{
"fuel": "gasoline",
"title": "BMW 750 i Automation",
"year": 2005,
"run": 343906,
"type": "Grey"
},
你可以这样做。查看此链接,了解如何在 Python 中创建字典:
import json

# this is going to store your dicts of cars
list_of_printed_cars = []

# NOTE: `soup` and `cars_printed_counter` are not defined in this snippet;
# they are expected to come from the surrounding scraping script.
for single_car in soup.find_all('div', class_='cars-wrapper'):
    if cars_printed_counter == 20:
        break
    Car_Title = single_car.find('h2', class_='cars__title')
    Car_Specs = single_car.find('p', class_='cars__subtitle')
    # printed_car is a dictionary of the car's title and specs
    printed_car = {
        'title': Car_Title.text,
        'specs': Car_Specs.text,
    }
    # this appends to a list that stores each car's title and specs
    list_of_printed_cars.append(printed_car)
    # Count the stored car; without this the 20-car limit above never triggers.
    cars_printed_counter += 1

# to use list_of_printed_cars, serialize it to JSON and write it to a file
with open('data.json', 'w') as f:
    json.dump(list_of_printed_cars, f)
然后,您可以通过 json.dumps(或者像上面那样用 json.dump 直接写入文件)把 list_of_printed_cars 中的字典转换为 JSON,并将其保存到文件中。
开门见山:
import requests
from bs4 import BeautifulSoup
import json

URL = 'https://lt.brcauto.eu/automobiliu-paieska/'

req = requests.get(URL)
soup = BeautifulSoup(req.text, 'lxml')
# Second-to-last pagination item: per the author's note the last item is the
# ">" arrow, so the one before it carries the biggest page number.
pages = soup.find_all('li', class_='page-item')[-2]

cars_printed_counter = 0
# One list for the cars of every page. It is created before the page loop so
# that moving on to the next page does not throw away earlier results.
out = []
# "+ 1" because range() excludes its upper bound and the last page is wanted.
for number in range(1, int(pages.text) + 1):
    # Check the limit first so no extra page is downloaded after 20 cars.
    if cars_printed_counter == 20:
        break
    req = requests.get(URL + '?page=' + str(number))
    soup = BeautifulSoup(req.text, 'lxml')
    for single_car in soup.find_all('div', class_='cars-wrapper'):
        if cars_printed_counter == 20:
            break
        Car_Title = single_car.find('h2', class_='cars__title')
        Car_Specs = single_car.find('p', class_='cars__subtitle')
        print('\nCar number:', cars_printed_counter + 1)
        print(Car_Title.text)
        print(Car_Specs.text)

        # The specs text is expected to look like
        # "2013 | 2.0 Diesel | Automation | 255229 km | 135 kW (184 AG) | Black"
        subs = Car_Specs.text.split(' | ')
        car = {}
        car["title"] = Car_Title.text
        # Store year and mileage as numbers, as in the desired JSON.
        car["year"] = int(subs[0])
        # "2.0 Diesel" -> "diesel"
        car["fuel"] = subs[1].split(" ")[1].lower()
        # "255229 km" -> 255229
        car["run"] = int(subs[3].split(" ")[0])
        car["type"] = subs[5]
        car["number"] = cars_printed_counter + 1
        out.append(car)
        cars_printed_counter += 1

# Serialize once and reuse the string for both the console and the file.
json_text = json.dumps(out)
print(json_text)
with open("outfile.json", "w") as f:
    f.write(json_text)
解释:我们创建了一个 out 变量来保存所有的汽车。在遍历这些汽车时,我们为每辆车创建一个包含所需值的字典。由于 specs 是一个字符串,我们用“ | ”将其拆分成各个独立的字段,然后把每个字段映射到字典中的一个键。接着把该字典追加到 out 列表中。最终,我们得到一个包含所需全部信息的字典列表。然后对该列表调用 json.dumps() 得到 JSON 字符串,并将其保存到文件中。
我想抓取一个网站,并将所需的数据放入 JSON 文件。我遇到的问题是:我得到的只是一段文本,并且只能把它打印出来。但我只需要把特定的数据写入 JSON 文件,并在我的类中重复使用这些数据。我正在抓取的网站和我的代码如下:
import requests
from bs4 import BeautifulSoup

URL = 'https://lt.brcauto.eu/automobiliu-paieska/'

req = requests.get(URL)
soup = BeautifulSoup(req.text, 'lxml')
# Second-to-last pagination item: per the author's note the last item is the
# ">" arrow, so the one before it carries the biggest page number.
pages = soup.find_all('li', class_='page-item')[-2]

cars_printed_counter = 0
for number in range(1, int(pages.text)):
    req = requests.get(URL + '?page=' + str(number))
    soup = BeautifulSoup(req.text, 'lxml')
    # Stop walking pages once 20 cars have been printed.
    if cars_printed_counter == 20:
        break
    for single_car in soup.find_all('div', class_='cars-wrapper'):
        # The limit can also be reached in the middle of a page.
        if cars_printed_counter == 20:
            break
        Car_Title = single_car.find('h2', class_='cars__title')
        Car_Specs = single_car.find('p', class_='cars__subtitle')
        print('\nCar number:', cars_printed_counter + 1)
        print(Car_Title.text)
        print(Car_Specs.text)
        cars_printed_counter += 1
我得到的数据是这样的:Printed results
Car number: 19
BMW 520 Gran Turismo M-Sport
2013 | 2.0 Diesel | Automation | 255229 km | 135 kW (184 AG) | Black
Car number: 20
BMW 750 i Automation
2005 | 5.0 Gasoline | Automation | 343906 km | 270 kW (367 AG) | Grey
问题是:我应该如何将数据放入 JSON 文件,使其看起来像这样:Desired json
[
{
"fuel": "diesel",
"title": "BMW 520 Gran Turismo M-Sport",
"year": 2013,
"run": 255229,
"type": "Black"
},
{
"fuel": "gasoline",
"title": "BMW 750 i Automation",
"year": 2005,
"run": 343906,
"type": "Grey"
},
你可以这样做。查看此链接,了解如何在 Python 中创建字典:
import json
# this is going to store your dicts of cars
list_of_printed_cars = []

# NOTE: `soup` and `cars_printed_counter` are not defined in this snippet;
# they are expected to come from the surrounding scraping script.
for single_car in soup.find_all('div', class_='cars-wrapper'):
    if cars_printed_counter == 20:
        break
    Car_Title = single_car.find('h2', class_='cars__title')
    Car_Specs = single_car.find('p', class_='cars__subtitle')
    # printed_car is a dictionary of the car's title and specs
    printed_car = {
        'title': Car_Title.text,
        'specs': Car_Specs.text,
    }
    # this appends to a list that stores each car's title and specs
    list_of_printed_cars.append(printed_car)
    # Count the stored car; without this the 20-car limit above never triggers.
    cars_printed_counter += 1

# to use list_of_printed_cars, serialize it to JSON and write it to a file
with open('data.json', 'w') as f:
    json.dump(list_of_printed_cars, f)
然后,您可以通过 json.dumps(或者像上面那样用 json.dump 直接写入文件)把 list_of_printed_cars 中的字典转换为 JSON,并将其保存到文件中。
开门见山:
import requests
from bs4 import BeautifulSoup
import json

URL = 'https://lt.brcauto.eu/automobiliu-paieska/'

req = requests.get(URL)
soup = BeautifulSoup(req.text, 'lxml')
# Second-to-last pagination item: per the author's note the last item is the
# ">" arrow, so the one before it carries the biggest page number.
pages = soup.find_all('li', class_='page-item')[-2]

cars_printed_counter = 0
# One list for the cars of every page. It is created before the page loop so
# that moving on to the next page does not throw away earlier results.
out = []
# "+ 1" because range() excludes its upper bound and the last page is wanted.
for number in range(1, int(pages.text) + 1):
    # Check the limit first so no extra page is downloaded after 20 cars.
    if cars_printed_counter == 20:
        break
    req = requests.get(URL + '?page=' + str(number))
    soup = BeautifulSoup(req.text, 'lxml')
    for single_car in soup.find_all('div', class_='cars-wrapper'):
        if cars_printed_counter == 20:
            break
        Car_Title = single_car.find('h2', class_='cars__title')
        Car_Specs = single_car.find('p', class_='cars__subtitle')
        print('\nCar number:', cars_printed_counter + 1)
        print(Car_Title.text)
        print(Car_Specs.text)

        # The specs text is expected to look like
        # "2013 | 2.0 Diesel | Automation | 255229 km | 135 kW (184 AG) | Black"
        subs = Car_Specs.text.split(' | ')
        car = {}
        car["title"] = Car_Title.text
        # Store year and mileage as numbers, as in the desired JSON.
        car["year"] = int(subs[0])
        # "2.0 Diesel" -> "diesel"
        car["fuel"] = subs[1].split(" ")[1].lower()
        # "255229 km" -> 255229
        car["run"] = int(subs[3].split(" ")[0])
        car["type"] = subs[5]
        car["number"] = cars_printed_counter + 1
        out.append(car)
        cars_printed_counter += 1

# Serialize once and reuse the string for both the console and the file.
json_text = json.dumps(out)
print(json_text)
with open("outfile.json", "w") as f:
    f.write(json_text)
解释:我们创建了一个 out 变量来保存所有的汽车。在遍历这些汽车时,我们为每辆车创建一个包含所需值的字典。由于 specs 是一个字符串,我们用“ | ”将其拆分成各个独立的字段,然后把每个字段映射到字典中的一个键。接着把该字典追加到 out 列表中。最终,我们得到一个包含所需全部信息的字典列表。然后对该列表调用 json.dumps() 得到 JSON 字符串,并将其保存到文件中。