从 html 页面中提取 URL

Extracting URLs from an HTML page

我正在使用以下代码:

import requests, pandas as pd
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Search-results page to scrape; all filters are encoded in the query string.
    url = r'https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1'
    # Download the page and parse it with the lxml parser.
    response = requests.get(url)
    parsed_page = BeautifulSoup(response.text, "lxml")
    # Despite its name, `soup` holds only the <div class="mpi_info"> elements,
    # not the whole parsed document.
    soup = parsed_page.find_all("div", class_="mpi_info")

我想把所有形如 "/homedetail/30729-mcguinness-dr-spring-tx-77386/5204857" 的网址提取到一个 DataFrame 中,但不知道该怎么做。

import requests, pandas as pd
from bs4 import BeautifulSoup
def scraper():
    """Return the href of every ``/homedetail/`` link on the search page.

    Downloads the hard-coded search-results URL, parses the HTML and collects
    the ``href`` attribute of each ``<a>`` tag whose target starts with
    ``/homedetail/``. Links are returned in document order and repeated
    links are kept, so a link that occurs more than once on the page occurs
    more than once in the result.

    Returns:
        list: The matching ``href`` strings (site-relative paths).

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    url = r'https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1'
    # requests waits indefinitely by default; bound the wait so an
    # unresponsive server cannot hang the script.
    response = requests.get(url, timeout=30)
    # Name the parser explicitly: without it BeautifulSoup guesses one, emits
    # GuessedAtParserWarning and may behave differently between machines.
    soup = BeautifulSoup(response.text, "lxml")
    # href=True restricts the search to anchors that actually carry an href,
    # so the attribute is read once and no None check is needed.
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('/homedetail/')
    ]

if __name__ == '__main__':
    # Collect the detail-page links, then display them as a one-column table.
    urls = scraper()
    df = pd.DataFrame(data=urls)
    print(df)
    

输出:

0   /homedetail/30729-mcguinness-dr-spring-tx-7738...
1   /homedetail/30729-mcguinness-dr-spring-tx-7738...
2     /homedetail/11-dovecote-spring-tx-77382/5323232
3     /homedetail/11-dovecote-spring-tx-77382/5323232
4   /homedetail/9934-crestwater-cir-magnolia-tx-77...
5   /homedetail/9934-crestwater-cir-magnolia-tx-77...
6   /homedetail/3-shanewood-ct-spring-tx-77382/532...
7   /homedetail/3-shanewood-ct-spring-tx-77382/532...
8   /homedetail/22-solebrook-path-tomball-tx-77375...
9   /homedetail/22-solebrook-path-tomball-tx-77375...
10  /homedetail/24-snowdrop-lily-dr-tomball-tx-773...
11  /homedetail/24-snowdrop-lily-dr-tomball-tx-773...
12  /homedetail/26-freestone-pl-spring-tx-77382/97...
13  /homedetail/26-freestone-pl-spring-tx-77382/97...
14  /homedetail/8557-alford-point-dr-magnolia-tx-7...
15  /homedetail/8557-alford-point-dr-magnolia-tx-7...
16  /homedetail/210-spyglass-park-loop-montgomery-...
17  /homedetail/210-spyglass-park-loop-montgomery-...
18  /homedetail/6-rosedown-pl-spring-tx-77382/5329545
19  /homedetail/6-rosedown-pl-spring-tx-77382/5329545
20  /homedetail/51-lenox-hill-dr-spring-tx-77382/5...
21  /homedetail/51-lenox-hill-dr-spring-tx-77382/5...
22  /homedetail/19-s-garnet-bnd-spring-tx-77382/91...
23  /homedetail/19-s-garnet-bnd-spring-tx-77382/91...

这些链接位于 class 为 "address" 的 <a> 标签中。创建一个包含所有 href 的列表,并将其传递给 DataFrame:

import requests, pandas as pd
from bs4 import BeautifulSoup


url = "https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1"
# requests waits indefinitely by default; bound the wait so an unresponsive
# server cannot hang the script.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, "lxml")


# Collect the href of every <a class="address"> element. href=True skips any
# such anchor that has no href, which would otherwise raise KeyError below.
address_links = [
    tag["href"] for tag in soup.find_all("a", class_="address", href=True)
]
df = pd.DataFrame(address_links)
# to_string() renders the whole frame, so long paths are not truncated the way
# they are when the frame is printed directly.
print(df.to_string())

输出:

                                                                         0
0                   /homedetail/30729-mcguinness-dr-spring-tx-77386/5204857
1                           /homedetail/11-dovecote-spring-tx-77382/5323232
2                /homedetail/9934-crestwater-cir-magnolia-tx-77354/11567525
3                        /homedetail/3-shanewood-ct-spring-tx-77382/5325643
4                   /homedetail/22-solebrook-path-tomball-tx-77375/12190176
5                 /homedetail/24-snowdrop-lily-dr-tomball-tx-77375/14652805
6                       /homedetail/26-freestone-pl-spring-tx-77382/9791228
7   /homedetail/8557-alford-point-dr-magnolia-tx-77354/13580284?lid=6218369
8           /homedetail/210-spyglass-park-loop-montgomery-tx-77316/12783261
9                         /homedetail/6-rosedown-pl-spring-tx-77382/5329545
10                     /homedetail/51-lenox-hill-dr-spring-tx-77382/5331072
11                      /homedetail/19-s-garnet-bnd-spring-tx-77382/9164284