python 网页抓取清理
python webscrape cleaning
我正在使用 python 和 beautifulsoup 来捕获和打印以下内容:
小的
5' × 10'
外部 unit/Drive-up 访问
56 美元/月。
店内 70 美元
我设法让它正确地打印出单位大小(小)和单位类型(在 unit/drive-up 访问之外),但是,其他人正在打印出正确的数据以及 "div class text."
有人知道我怎样才能正确抓取它吗?
我将添加以下代码;
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#setting my_url to the wesite
my_url = 'https://www.publicstorage.com/north-carolina/self-storage-
charlotte-nc/28206-self-storage/2334?
lat=35.23552&lng=-80.83296&clp=1&sp=Charlotte|35.2270869|-80.8431267&ismi=1'
#Opening up connection, grabbing the page
uClient = uReq(my_url)
#naming uClient to page_html
page_html = uClient.read()
#closing uClient
uClient.close()
#this does my html parsing
page_soup = soup(page_html, "html.parser")
#setting container to capture where the actual info is using inspect element
#grabs each product
containers = page_soup.findAll("li",{"class":"srp_res_row plp"})
filename = "product.csv"
f = open(filename, "w")
headers = "unit_size, size_dim, unit_type, online_price, reg_price\n"
f.write(headers)
for container in containers:
title_container = container.div.div
unit_size = title_container.text
size_dim = container.findAll("div", {"class":"srp_label srp_font_14"})
unit_container = container.li
unit_type = unit_container.text
online_price = container.findAll("div", {"class":"srp_label alt-price"})
reg_price = container.findAll("div", {"class":"reg-price"})
print("unit_size: " + str(unit_size))
print("size_dim: " + str(size_dim))
print("unit_type: " + str(unit_type))
print("online_price: " + str(online_price))
print("reg_price: " + str(reg_price))
f.write(str(unit_size) + "," +str(size_dim) + "," +str(unit_type) + ","
+str(online_price) + "," +str(reg_price) + "\n")
f.close()
<li class="srp_res_row plp">
<div class="srp_res_clm srp_clm160">
<div class="srp_label plp">Small</div>
<div class="srp_v-space_3"></div>
<div class="srp_label srp_font_14" style="padding-left: 5px;">5' x 10'</div>
<div class="srp_v-space_3"></div>
</div>
<div class="srp_res_clm srp_clm120">
<ul class="srp_list">
<li>Outside unit/Drive-up access</li>
</ul>
</div>
<div class="srp_res_clm srp_clm90">
<div class="srp_label"><span class="srp_label_symbol">†</span></div>
<div class="srp_v-space_10">1st Month</div>
</div>
<div class="srp_res_clm srp_clm90">
<div class="srp_label alt-price">/mo.</div>
<div class="online-special">Online Special<span class="srp_label_symbol">†</span></div>
<div class="srp_v-space_15"></div>
<div class="reg-price"> In-store</div>
</div>
<div class="srp_res_clm srp_clm100 srp_vcenter"><a class="srp_continue unit-no-deposit" data-deposit-amount="0" data-deposit-days="0" data-features="Outside unit/Drive-up access" data-marketing-size="5x10" data-ppk="altproduct_price" data-promotionid="132" data-siteid="2334" data-size-description="5' x 10'" data-sizeid="613573" data-wc2-unit="false" href="/ReservationDetails.aspx?st=2334&sz=613573&key=[rnd]&location=&plp=1&rk=&ismi=1&sp=Charlotte%7c35.2270869%7c-80.8431267&clp=1"><img alt="Continue" src="/images/srp-cont-new-80.png" style="width: 80px; height: 32px"/></a></div>
</li>
find_all
returns 一个 ResultSet 对象,您可以使用 for 循环对其进行迭代。
试试这个代码:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#setting my_url to the wesite
my_url = 'https://www.publicstorage.com/north-carolina/self-storage-charlotte-nc/28206-self-storage/2334?lat=35.23552&lng=-80.83296&clp=1&sp=Charlotte|35.2270869|-80.8431267&ismi=1'
#Opening up connection, grabbing the page
uClient = uReq(my_url)
#naming uClient to page_html
page_html = uClient.read()
#closing uClient
uClient.close()
#this does my html parsing
page_soup = soup(page_html, "html.parser")
#setting container to capture where the actual info is using inspect element
#grabs each product
containers = page_soup.findAll("li",{"class":"srp_res_row plp"})
filename = "product.csv"
f = open(filename, "w")
headers = "unit_size, size_dim, unit_type, online_price, reg_price\n"
f.write(headers)
for container in containers:
title_container = container.div.div
unit_size = title_container.text
size_dim = container.findAll("div", {"class":"srp_label srp_font_14"})
unit_container = container.li
unit_type = unit_container.text
online_price = container.findAll("div", {"class":"srp_label alt-price"})
reg_price = container.findAll("div", {"class":"reg-price"})
print("unit_size: " + str(unit_size))
print("size_dim: {}".format("".join(list(i.text for i in size_dim)))) #edited line
print("unit_type: " + str(unit_type))
print("Online Price: {}".format("".join(list(i.text for i in online_price)))) #edited line
print("Online Price: {}".format("".join(list(i.text for i in reg_price)))) #edited line
f.write(str(unit_size) + "," +str(size_dim) + "," +str(unit_type) + ","
+str(online_price) + "," +str(reg_price) + "\n")
f.close()
输出:
unit_size: Small
size_dim: 5' x 10'
unit_type: Outside unit/Drive-up access
Online Price: /mo.
Online Price: In-store
unit_size: Medium
size_dim: 5' x 15'
unit_type: Outside unit/Drive-up access
Online Price: /mo.
Online Price: 0 In-store
Updated code as per comments :
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#setting my_url to the wesite
my_url = 'https://www.publicstorage.com/north-carolina/self-storage-charlotte-nc/28206-self-storage/2334?lat=35.23552&lng=-80.83296&clp=1&sp=Charlotte|35.2270869|-80.8431267&ismi=1'
#Opening up connection, grabbing the page
uClient = uReq(my_url)
#naming uClient to page_html
page_html = uClient.read()
#closing uClient
uClient.close()
#this does my html parsing
page_soup = soup(page_html, "html.parser")
#setting container to capture where the actual info is using inspect element
#grabs each product
containers = page_soup.findAll("li",{"class":"srp_res_row plp"})
filename = "product.csv"
f = open(filename, "w")
headers = "unit_size, size_dim1, unit_type, online_price, reg_price\n"
f.write(headers)
for container in containers:
title_container = container.div.div
unit_size = title_container.text
size_dim = container.findAll("div", {"class":"srp_label srp_font_14"})
unit_container = container.li
unit_type = unit_container.text
online_price = container.findAll("div", {"class":"srp_label alt-price"})
reg_price = container.findAll("div", {"class":"reg-price"})
for item in zip(unit_size,size_dim,unit_container,online_price,reg_price):
csv=item[0] + "," + item[1].text + "," + item[2] + "," + item[3].text + "," + item[4].text + "\n"
f.write(csv)
我正在使用 python 和 beautifulsoup 来捕获和打印以下内容: 小的 5' × 10' 外部 unit/Drive-up 访问 56 美元/月。 店内 70 美元
我设法让它正确地打印出单位大小(小)和单位类型(在 unit/drive-up 访问之外),但是,其他人正在打印出正确的数据以及 "div class text."
有人知道我怎样才能正确抓取它吗? 我将添加以下代码;
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#setting my_url to the wesite
my_url = 'https://www.publicstorage.com/north-carolina/self-storage-
charlotte-nc/28206-self-storage/2334?
lat=35.23552&lng=-80.83296&clp=1&sp=Charlotte|35.2270869|-80.8431267&ismi=1'
#Opening up connection, grabbing the page
uClient = uReq(my_url)
#naming uClient to page_html
page_html = uClient.read()
#closing uClient
uClient.close()
#this does my html parsing
page_soup = soup(page_html, "html.parser")
#setting container to capture where the actual info is using inspect element
#grabs each product
containers = page_soup.findAll("li",{"class":"srp_res_row plp"})
filename = "product.csv"
f = open(filename, "w")
headers = "unit_size, size_dim, unit_type, online_price, reg_price\n"
f.write(headers)
for container in containers:
title_container = container.div.div
unit_size = title_container.text
size_dim = container.findAll("div", {"class":"srp_label srp_font_14"})
unit_container = container.li
unit_type = unit_container.text
online_price = container.findAll("div", {"class":"srp_label alt-price"})
reg_price = container.findAll("div", {"class":"reg-price"})
print("unit_size: " + str(unit_size))
print("size_dim: " + str(size_dim))
print("unit_type: " + str(unit_type))
print("online_price: " + str(online_price))
print("reg_price: " + str(reg_price))
f.write(str(unit_size) + "," +str(size_dim) + "," +str(unit_type) + ","
+str(online_price) + "," +str(reg_price) + "\n")
f.close()
<li class="srp_res_row plp">
<div class="srp_res_clm srp_clm160">
<div class="srp_label plp">Small</div>
<div class="srp_v-space_3"></div>
<div class="srp_label srp_font_14" style="padding-left: 5px;">5' x 10'</div>
<div class="srp_v-space_3"></div>
</div>
<div class="srp_res_clm srp_clm120">
<ul class="srp_list">
<li>Outside unit/Drive-up access</li>
</ul>
</div>
<div class="srp_res_clm srp_clm90">
<div class="srp_label"><span class="srp_label_symbol">†</span></div>
<div class="srp_v-space_10">1st Month</div>
</div>
<div class="srp_res_clm srp_clm90">
<div class="srp_label alt-price">/mo.</div>
<div class="online-special">Online Special<span class="srp_label_symbol">†</span></div>
<div class="srp_v-space_15"></div>
<div class="reg-price"> In-store</div>
</div>
<div class="srp_res_clm srp_clm100 srp_vcenter"><a class="srp_continue unit-no-deposit" data-deposit-amount="0" data-deposit-days="0" data-features="Outside unit/Drive-up access" data-marketing-size="5x10" data-ppk="altproduct_price" data-promotionid="132" data-siteid="2334" data-size-description="5' x 10'" data-sizeid="613573" data-wc2-unit="false" href="/ReservationDetails.aspx?st=2334&sz=613573&key=[rnd]&location=&plp=1&rk=&ismi=1&sp=Charlotte%7c35.2270869%7c-80.8431267&clp=1"><img alt="Continue" src="/images/srp-cont-new-80.png" style="width: 80px; height: 32px"/></a></div>
</li>
find_all
returns 一个 ResultSet 对象,您可以使用 for 循环对其进行迭代。
试试这个代码:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#setting my_url to the wesite
my_url = 'https://www.publicstorage.com/north-carolina/self-storage-charlotte-nc/28206-self-storage/2334?lat=35.23552&lng=-80.83296&clp=1&sp=Charlotte|35.2270869|-80.8431267&ismi=1'
#Opening up connection, grabbing the page
uClient = uReq(my_url)
#naming uClient to page_html
page_html = uClient.read()
#closing uClient
uClient.close()
#this does my html parsing
page_soup = soup(page_html, "html.parser")
#setting container to capture where the actual info is using inspect element
#grabs each product
containers = page_soup.findAll("li",{"class":"srp_res_row plp"})
filename = "product.csv"
f = open(filename, "w")
headers = "unit_size, size_dim, unit_type, online_price, reg_price\n"
f.write(headers)
for container in containers:
title_container = container.div.div
unit_size = title_container.text
size_dim = container.findAll("div", {"class":"srp_label srp_font_14"})
unit_container = container.li
unit_type = unit_container.text
online_price = container.findAll("div", {"class":"srp_label alt-price"})
reg_price = container.findAll("div", {"class":"reg-price"})
print("unit_size: " + str(unit_size))
print("size_dim: {}".format("".join(list(i.text for i in size_dim)))) #edited line
print("unit_type: " + str(unit_type))
print("Online Price: {}".format("".join(list(i.text for i in online_price)))) #edited line
print("Online Price: {}".format("".join(list(i.text for i in reg_price)))) #edited line
f.write(str(unit_size) + "," +str(size_dim) + "," +str(unit_type) + ","
+str(online_price) + "," +str(reg_price) + "\n")
f.close()
输出:
unit_size: Small
size_dim: 5' x 10'
unit_type: Outside unit/Drive-up access
Online Price: /mo.
Online Price: In-store
unit_size: Medium
size_dim: 5' x 15'
unit_type: Outside unit/Drive-up access
Online Price: /mo.
Online Price: 0 In-store
Updated code as per comments :
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#setting my_url to the wesite
my_url = 'https://www.publicstorage.com/north-carolina/self-storage-charlotte-nc/28206-self-storage/2334?lat=35.23552&lng=-80.83296&clp=1&sp=Charlotte|35.2270869|-80.8431267&ismi=1'
#Opening up connection, grabbing the page
uClient = uReq(my_url)
#naming uClient to page_html
page_html = uClient.read()
#closing uClient
uClient.close()
#this does my html parsing
page_soup = soup(page_html, "html.parser")
#setting container to capture where the actual info is using inspect element
#grabs each product
containers = page_soup.findAll("li",{"class":"srp_res_row plp"})
filename = "product.csv"
f = open(filename, "w")
headers = "unit_size, size_dim1, unit_type, online_price, reg_price\n"
f.write(headers)
for container in containers:
title_container = container.div.div
unit_size = title_container.text
size_dim = container.findAll("div", {"class":"srp_label srp_font_14"})
unit_container = container.li
unit_type = unit_container.text
online_price = container.findAll("div", {"class":"srp_label alt-price"})
reg_price = container.findAll("div", {"class":"reg-price"})
for item in zip(unit_size,size_dim,unit_container,online_price,reg_price):
csv=item[0] + "," + item[1].text + "," + item[2] + "," + item[3].text + "," + item[4].text + "\n"
f.write(csv)