Parse specific item in XML by id
I'm a beginner, so I'm doing these things to improve myself.
I'm trying to get a specific RSS/XML item by its ID.
I want to fetch the content of a specific blog post by its "post-id". How can I do that?
Sorry if this is a frequently asked question, but I really couldn't find a working solution.
My code looks like this:
import discord
import requests
import feedparser
from bs4 import BeautifulSoup

blog_url = "https://blog.counter-strike.net/index.php/feed/"
upd_url = "https://blog.counter-strike.net/index.php/category/updates/feed/"
history_file = "history.txt"
h_file = open("history.txt", "a")

def scrap_rss(scrap_param):
    article_list = []
    try:
        r = requests.get(scrap_param)
        soup = BeautifulSoup(r.content, features='xml')
        articles = soup.findAll('item')
        postids = soup.findAll('post-id')
        print('The scraping job succeeded: ', r.status_code)
        for a in articles:
            title = a.find('title').text
            link = a.find('link').text
            published = a.find('pubDate').text
            postid = a.find('post-id').text
            article = {
                'title': title,
                'link': link,
                'published': published,
                'post-id': postid
            }
            article_list.append(article)
            with open(history_file) as f:
                if postid in f.read():
                    print("true")
                else:
                    print("false" + postid)
                    h_file.write(postid + "\n")
            # newpost = a.find(".//item/[post-id="+postid+"]/name")
            # newpost = postids.find(text="29701")
        return print(article_list)
    except Exception as e:
        print('The scraping job failed. See exception: ')
        print(e)

print('Starting scraping')
scrap_rss(blog_url)
print('Finished scraping')
The code below looks for the item with the given post-id (31917) and extracts its pubDate. Note that the <post-id> element sits in the com-wordpress:feed-additions:1 namespace declared in the feed, so ElementTree has to address it as {com-wordpress:feed-additions:1}post-id.
import requests
import xml.etree.ElementTree as ET

r = requests.get('https://blog.counter-strike.net/index.php/feed/')
if r.status_code == 200:
    root = ET.fromstring(r.text)
    item = [item for item in root.findall('.//item') if
            item.find('{com-wordpress:feed-additions:1}post-id').text == '31917'][0]
    print(f'published at {item.find("pubDate").text}')
Output
published at Thu, 03 Dec 2020 22:13:08 +0000
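
If you would rather stay with the BeautifulSoup setup from the question, the same lookup can be done by walking the <item> elements and comparing their <post-id> text. Below is a minimal sketch under that assumption; find_item_by_post_id is just an illustrative helper name and 31917 is only an example id.

import requests
from bs4 import BeautifulSoup

def find_item_by_post_id(feed_url, wanted_id):
    # Fetch the feed and parse it as XML, exactly as in the question's code
    r = requests.get(feed_url)
    soup = BeautifulSoup(r.content, features='xml')
    # Walk every <item> and return the first one whose <post-id> text matches
    for item in soup.find_all('item'):
        post_id = item.find('post-id')
        if post_id is not None and post_id.text == wanted_id:
            return item
    return None  # no item with that post-id in this feed

item = find_item_by_post_id('https://blog.counter-strike.net/index.php/feed/', '31917')
if item is not None:
    print(item.find('title').text)
    print(item.find('pubDate').text)
    print(item.find('description').text)  # summary or body, depending on what the feed provides

This does the same thing as the namespaced ElementTree lookup above; bs4's xml parser keeps the element's local name here, which is also why a.find('post-id') already worked in the original code. If the feed publishes the full body in <content:encoded>, the ElementTree version can read it with item.find('{http://purl.org/rss/1.0/modules/content/}encoded').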