我正在网上抓取产品和价格。输出结果中,产品和价格之间夹杂着多余的字符,我该如何删除它们?
I am web-scraping for products and prices. The output is coming out with extra characters in between the products and prices. How do I remove them?
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Fetch the listing page and parse its HTML.
page = requests.get("**website name**")
soup = BeautifulSoup(page.content, 'html.parser')

# Locate the product grid container, then every product tile inside it.
books = soup.find('div', attrs={'class': 'row justify-content-md-first'})
items = books.find_all(class_='col-12')


def _raw_text(tile, css_class):
    """Return the unmodified text of the first child of *tile* with *css_class*."""
    return tile.find(class_=css_class).get_text()


# The text is taken exactly as get_text() returns it (no stripping is done
# here), so any whitespace surrounding the values in the markup is preserved.
product_titles = [_raw_text(tile, 'product_title') for tile in items]
product_prices = [_raw_text(tile, 'product_price') for tile in items]
print(product_titles)

# Assemble both lists into a two-column table, show it, and save it as CSV.
product_list = pd.DataFrame(
    data={
        'product_title': product_titles,
        'product_price': product_prices,
    }
)
print(product_list)
product_list.to_csv('Product.csv')
product_title product_price
0 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Dot–To–Dot – ... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
1 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Games and Puz... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
2 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Hidden Pictur... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
3 \n\r\n\t\t\t\t\t\t\t\t\tAmazing Mazes – Ages 4... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
4 \n\r\n\t\t\t\t\t\t\t\t\tEarly Maths – Ages 4–5... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
5 \n\r\n\t\t\t\t\t\t\t\t\tEarly Reading – Ages 4... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
6 \n\r\n\t\t\t\t\t\t\t\t\tLearning Centres – Sel... \r\n\t\t\t\t\t\t\t\tR210.00\r\n\t\t\t\t\t\t\t
7 \n\r\n\t\t\t\t\t\t\t\t\tLetters and Sounds – A... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
8 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Dot–To–Dot – A... \r\n\t\t\t\t\t\t\t\tR64.00\r\n\t\t\t\t\t\t\t
9 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Fun – Ages 4–5... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
10 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Hidden Picture... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
11 \n\r\n\t\t\t\t\t\t\t\t\tPatterns and Sequence ... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
12 \n\r\n\t\t\t\t\t\t\t\t\tTracing and Cutting – ... \r\n\t\t\t\t\t\t\t\tR64.00\r\n\t\t\t\t\t\t\t
您可以使用 pandas.Series.str.strip()
删除前导和尾随字符。
# Apply Series.str.strip() column by column to drop the leading and trailing
# whitespace (the \n, \r and \t runs) from every value in the DataFrame.
product_list = product_list.apply(lambda series: series.str.strip(), axis=0)
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Fetch the listing page and parse its HTML.
page = requests.get("**website name**")
soup = BeautifulSoup(page.content, 'html.parser')

# Locate the product grid container, then every product tile inside it.
books = soup.find('div', attrs={'class': 'row justify-content-md-first'})
items = books.find_all(class_='col-12')


def _raw_text(tile, css_class):
    """Return the unmodified text of the first child of *tile* with *css_class*."""
    return tile.find(class_=css_class).get_text()


# The text is taken exactly as get_text() returns it (no stripping is done
# here), so any whitespace surrounding the values in the markup is preserved.
product_titles = [_raw_text(tile, 'product_title') for tile in items]
product_prices = [_raw_text(tile, 'product_price') for tile in items]
print(product_titles)

# Assemble both lists into a two-column table, show it, and save it as CSV.
product_list = pd.DataFrame(
    data={
        'product_title': product_titles,
        'product_price': product_prices,
    }
)
print(product_list)
product_list.to_csv('Product.csv')
product_title product_price
0 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Dot–To–Dot – ... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
1 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Games and Puz... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
2 \n\r\n\t\t\t\t\t\t\t\t\tAlphabet Hidden Pictur... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
3 \n\r\n\t\t\t\t\t\t\t\t\tAmazing Mazes – Ages 4... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
4 \n\r\n\t\t\t\t\t\t\t\t\tEarly Maths – Ages 4–5... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
5 \n\r\n\t\t\t\t\t\t\t\t\tEarly Reading – Ages 4... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
6 \n\r\n\t\t\t\t\t\t\t\t\tLearning Centres – Sel... \r\n\t\t\t\t\t\t\t\tR210.00\r\n\t\t\t\t\t\t\t
7 \n\r\n\t\t\t\t\t\t\t\t\tLetters and Sounds – A... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
8 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Dot–To–Dot – A... \r\n\t\t\t\t\t\t\t\tR64.00\r\n\t\t\t\t\t\t\t
9 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Fun – Ages 4–5... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
10 \n\r\n\t\t\t\t\t\t\t\t\tNumbers Hidden Picture... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
11 \n\r\n\t\t\t\t\t\t\t\t\tPatterns and Sequence ... \r\n\t\t\t\t\t\t\t\tR65.00\r\n\t\t\t\t\t\t\t
12 \n\r\n\t\t\t\t\t\t\t\t\tTracing and Cutting – ... \r\n\t\t\t\t\t\t\t\tR64.00\r\n\t\t\t\t\t\t\t
您可以使用 pandas.Series.str.strip()
删除前导和尾随字符。
# Apply Series.str.strip() column by column to drop the leading and trailing
# whitespace (the \n, \r and \t runs) from every value in the DataFrame.
product_list = product_list.apply(lambda series: series.str.strip(), axis=0)