使用 Python 抓取网页 HTML

Web Scraping html using python

我正在尝试使用 Python 脚本从 “https://www.kucoin.com/news/categories/listing” 中提取 2 组数据,并将其放入列表或字典中。我已经尝试过 Selenium、BeautifulSoup 以及 requests,结果全部返回空值:[] 或 None。我已经为此折腾了一整天,但没有成功。我也尝试过使用完整的 XPath 来定位文本所在的位置,结果相同。非常感谢任何帮助。

##########################################################
from bs4 import BeautifulSoup
import requests

# Download the raw HTML of the listing page; requests does not run JavaScript.
response = requests.get('https://www.kucoin.com/news/categories/listing')

# Parse the returned markup with the lxml backend.
document = BeautifulSoup(response.text, features="lxml")

# Print the first element carrying the title CSS class (None when absent).
print(document.find(class_='mainTitle___mbpq1'))
###########################################################
import requests
from lxml import html

def main():
    """Fetch the KuCoin listing page and print the matching ``div`` elements.

    Requests the page, parses the response bytes with ``lxml.html`` and
    prints the list returned by the XPath query for ``div`` elements whose
    ``class`` attribute is exactly ``item___2ffLg``.
    """
    url = "https://www.kucoin.com/news/categories/listing"
    page = requests.get(url)
    # Parse the raw bytes so lxml can handle the encoding itself.
    tree = html.fromstring(page.content)
    text_val = tree.xpath('//div[@class="item___2ffLg"]')
    print(text_val)
###########################################################

The two values to extract: the 1st is the text between '(' and ')'; the 2nd is the date/time that follows 'Trading: '.

(到目前为止,我唯一能以文本形式拿到真正包含所需内容的页面的办法,就是手动将其另存为 *.mhtml 格式。)

打开 Chrome 开发者工具并刷新该页面,然后切换到 Network(网络)选项卡;左侧有一个搜索框,把页面上第一条标题(Cryowar ...)那一行粘贴进去搜索。

这样就能找到网页用来加载数据的 URL。点击 Headers 可以看到该 URL,将其复制后用 requests 模块调用,它会返回 JSON 响应:
# API endpoint found in the browser's network tab.
api_url = "https://www.kucoin.com/_api/cms/articles?page=1&pageSize=10&category=listing&lang=en_US"
res = requests.get(api_url)
# Decode the JSON body of the response (displayed when run interactively).
res.json()

输出:

{'success': True,
 'code': 200,
 'msg': 'success',
 'timestamp': 1636695390265,
 'totalNum': 461,
 'items': [{'id': 10358,
   'title': 'Cryowar (CWAR) Gets Listed on KuCoin! World Premiere!',
   'summary': 'Trading: 14:00 on November 12, 2021 (UTC)',

    ...

图片:

我检查了 requests.get 方法的响应,发现初始源代码只是 JavaScript。您必须等待它执行完成,才能解析最终渲染出来的 HTML。如果您可以接受使用 Selenium,下面是我想出的获取第一个元素的解决方案。请根据您的网络连接速度调整超时时间。

from selenium import webdriver

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Start a Firefox session and open the listing page.
driver = webdriver.Firefox()
driver.get("https://www.kucoin.com/news/categories/listing")
try:
    # Wait up to 5 seconds for the first entry container to be present in
    # the DOM; the page content is rendered by JavaScript after load.
    elem = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CLASS_NAME, "info___1vA3W"))
    )
    # find_element(By.TAG_NAME, ...) replaces find_element_by_tag_name,
    # which was deprecated in Selenium 4 and removed in later releases.
    title = elem.find_element(By.TAG_NAME, "a")
    date_desc = elem.find_element(By.TAG_NAME, "p")
    title_text = title.text
    date_text = date_desc.text
    print(title_text, date_text)
finally:
    # Always close the browser, even when the wait times out.
    driver.quit()

通用方法:等待所有元素出现在页面中,然后循环遍历并打印它们。如需以无头(headless)模式运行浏览器,可参考相应浏览器驱动的文档,这样就不会打开浏览器窗口。为方便起见,还添加了正则表达式搜索。

import re
from selenium import webdriver

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Run Firefox without a visible window. The "-headless" argument is used
# because the Options.headless setter was deprecated and later removed from
# Selenium.
options = Options()
options.add_argument("-headless")

# Compiled once, outside the loop: the ticker is the run of letters right
# after "(", the date is everything that follows "Trading: ".
ticker_pattern = re.compile(r'(?<=\()[A-Z]+', re.I)
date_pattern = re.compile(r'(?<=Trading: ).+')

driver = webdriver.Firefox(options=options)
driver.get("https://www.kucoin.com/news/categories/listing")
try:
    # Wait up to 5 seconds until the entry containers are present in the DOM.
    elements = WebDriverWait(driver, 5).until(
        EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'info___')]"))
    )
    for el in elements:
        # find_element(By.TAG_NAME, ...) replaces find_element_by_tag_name,
        # which was deprecated in Selenium 4 and removed in later releases.
        title = el.find_element(By.TAG_NAME, "a")
        date_desc = el.find_element(By.TAG_NAME, "p")
        ticker_match = ticker_pattern.search(title.text)
        date_match = date_pattern.search(date_desc.text)
        # Fall back to the raw text when an entry does not follow the
        # expected format, instead of failing on indexing None.
        title_text = ticker_match[0] if ticker_match else title.text
        date_text = date_match[0] if date_match else date_desc.text
        print(f'Title: {title_text}; Date: {date_text}')
finally:
    # Always close the browser, even when the wait times out.
    driver.quit()

如前所述,数据是通过 API 加载的。您可以使用 requests 来提取详细信息。

我只试过第 1 页(page=1)。

import requests

# Query the JSON endpoint directly. The timeout keeps the script from
# hanging indefinitely if the server never answers.
response = requests.get(
    "https://www.kucoin.com/_api/cms/articles?page=1&pageSize=10&category=listing&lang=en_US",
    timeout=10,
)
# Fail with a clear HTTP error instead of an obscure JSON/KeyError later on.
response.raise_for_status()

jsoncode = response.json()

# Each entry of 'items' describes one article; print its title and summary.
for item in jsoncode['items']:
    title = item['title']
    date = item['summary']
    print(f"{title} : {date}")
输出:

Cryowar (CWAR) Gets Listed on KuCoin! World Premiere! : Trading: 14:00 on November 12, 2021 (UTC)
Deeper Network (DPR) Gets Listed on KuCoin! : Trading: 06:00 on November 12, 2021 (UTC)
Vectorspace AI  (VXV) Gets Listed on KuCoin! : Trading: 8:00 on November 12, 2021 (UTC)
...