Python 从 xml 中删除标签 br 和其他标签

Question

我正在把这个链接 https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms 中的 XML 按标题拆分成多个 XML 文件：

#Python code to illustrate parsing of XML files
# importing the required modules
import os
import re
import xml.etree.ElementTree as ET

import requests

def loadRSS():
    """Download the Times of India RSS feed and save it as ``topnewsfeed.xml``.

    The raw response bytes are written unchanged to ``topnewsfeed.xml`` in the
    current working directory, replacing any previous copy.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or a timeout.
    """
    # url of rss feed
    url = "https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms"

    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, timeout=30)

    # Fail loudly on an HTTP error instead of saving an error page as "XML".
    resp.raise_for_status()

    # saving the xml file
    with open('topnewsfeed.xml', 'wb') as f:
        f.write(resp.content)
     

def wire_xml(filename):
    """Split a feed file into one XML file per ``<article>`` element.

    Every ``<article>`` found in ``filename`` is serialized to
    ``./xml/<headline>.xml``. Characters that are not allowed in file names
    are replaced with ``_``; articles without a usable headline are saved as
    ``untitled_<n>.xml``.

    Args:
        filename: path (or file object) of the XML feed to split.
    """
    out_dir = './xml'
    # The output directory is created on demand instead of assumed to exist.
    os.makedirs(out_dir, exist_ok=True)

    untitled = 0
    for _event, elem in ET.iterparse(filename, events=('end',)):
        if elem.tag != 'article':
            continue

        # findtext() returns None for a missing <headline> and '' for an
        # empty one; both would otherwise crash the file-name concatenation.
        title = elem.findtext('headline')
        if not title:
            untitled += 1
            title = 'untitled_{}'.format(untitled)

        # Headlines routinely contain '/', ':' or '?', which are path
        # separators or illegal in file names on common file systems.
        safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)

        with open(os.path.join(out_dir, safe_title + '.xml'), 'wb') as f:
            f.write(ET.tostring(elem))

        # The article is on disk; drop its children so memory use stays flat
        # while iterparse walks the rest of the feed.
        elem.clear()
      
def main():
    """Refresh the local feed copy, then split it into per-article files."""
    # download the latest feed, overwriting the existing xml file
    loadRSS()

    # write every news item of the downloaded feed to its own xml file
    feed_file = 'topnewsfeed.xml'
    wire_xml(feed_file)
      
if __name__ == "__main__":
  
    # Run the download-and-split pipeline only when this file is executed as
    # a script, not when it is imported as a module.
    main()

上面的代码可以运行，但存在 2 个问题：

1. XML 中的 content（文本）含有无用的标签，如何删除这些标签？示例：

 <content><div class="section1"><div class="Normal">HYDERABAD: Bharat Biotech on Friday said it has committed to supply over 500 million doses of its Covid-19 vaccine Covaxin to the Centre under the countrywide immunisation programme.<br/><br/>Speaking at a virtual conference organised by the Confederation of Indian Industry, Suchitra Ella, joint Managing Director of the city-based vaccine maker, said the company's facilities in four cities - Hyderabad, Bengaluru, Pune, and Ankaleshwar - are currently producing Covaxin.<br/><br/>"

2. 如何按照我要求的示例格式更改标签：

<?xml version="1.0" encoding="UTF-8"?>

-<nitf>


-<head>

<title>Ukraine Black Sea ports resume grain operations</title>


-<iim ver="3">

<ds value="" num="1:20"/>

<ds value="Reuter" num="1:30"/>

<ds value="" num="1:40"/>

<ds value="REU" num="1:50"/>

<ds value="20210723" num="1:70"/>

<ds value="055600+0000" num="1:80"/>

<ds value="Reuter.2021-07-23T055600Z_528892025_L1N2OZ07W_RTRMADT_0_GRAINS-UKRAINE-PORTS.XML" num="2:05"/>

<ds value="" num="2:07"/>

<ds value="3" num="2:10"/>

<ds value="OEC" num="2:15"/>

<ds value="" num="2:20"/>

<ds value="" num="2:22"/>

<ds value="GRAINS-UKRAINE/PORTS" num="2:25"/>

<ds value="" num="2:50"/>

<ds value="20210723" num="2:55"/>

<ds value="" num="2:80"/>

<ds value="" num="2:85"/>

<ds value="" num="2:90"/>

<ds value="" num="2:95"/>

<ds value="" num="2:101"/>

<ds value="Ukraine Black Sea ports resume grain operations" num="2:105"/>

<ds value="Reuter" num="2:110"/>

<ds value="Reuter" num="2:115"/>

<ds value="KYIV, July 23 (Reuters) - All Ukraine major Black Sea ports are working in normal mode, resuming operations affected by poor weather on Thursday, the state seaport authority said on Friday." num="2:120"/>

</iim>

</head>


-<body>


-<body.content>

<p>KYIV, July 23 (Reuters) - All Ukraine major Black Sea ports are working in normal mode, resuming operations affected by poor weather on Thursday, the state seaport authority said on Friday.</p>

<p>The restrictions of grain-loading operations had applied to the ports of Odesa, Chornomorsk, Mykolayiv, and Pivdeny.</p>

<p>Ukraine is among the world's biggest global grain exporters and plans to ship about 56 million tonnes of grain in the 2021/22 season. (Reporting by Pavel Polityuk)</p>

</body.content>

</body>

</nitf>

我不知道如何把文章保存成上面这种 XML 格式。

Answer 1

用于演示如何解析 XML 文件的 Python 代码

# importing the required modules
import re as re
import xml.etree.ElementTree as ET
import pandas as pd
from urllib.request import Request, urlopen
import configparser
import os

def loadRSS():
    """Open the Times of India RSS feed and return the HTTP response.

    Returns:
        The file-like response object produced by ``urlopen`` (it can be
        handed straight to ``ET.parse``), or ``None`` when the request fails.
        On failure the error is printed and the user is prompted before the
        function returns.
    """
    # url of rss feed
    url = "https://timesofindia.indiatimes.com/toirssfeed/-2128936835.cms"
    # Send a browser-like User-Agent -- presumably the site rejects the
    # default urllib agent; confirm before removing.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # The timeout keeps a stalled connection from blocking forever; a
        # timeout surfaces as an OSError subclass and is handled below.
        return urlopen(req, timeout=30)
    except OSError as e:
        print("Error in connecting TIL site :- ",e)
        input("Press andy to Close")
        # Explicit so callers can see that failure is signalled with None.
        return None

def parseXML(xmlfile):
    """Parse the feed and return one dict per child of the root element.

    Args:
        xmlfile: file name or file object accepted by ``ET.parse``.

    Returns:
        A list of dicts with the keys ``ID`` (the node's ``ID`` attribute),
        ``headline``, ``imagename``, ``content``, ``summary``, ``caption`` and
        ``cats`` (the text of the child element of that name). A value is
        ``None`` when the attribute or child element is missing or empty.
    """
    # Child elements copied verbatim into each news item, in output order.
    fields = ("headline", "imagename", "content", "summary", "caption", "cats")

    # create element tree object and get its root element
    root = ET.parse(xmlfile).getroot()

    news = []
    # iterate through each node of the tree
    for node in root:
        item = {"ID": node.attrib.get("ID")}
        for field in fields:
            child = node.find(field)
            # find() returns None for an absent child; reading .text on it
            # directly would raise AttributeError and abort the whole feed.
            item[field] = child.text if child is not None else None
        news.append(item)
    # return data in form of list
    return news

def savetodf(newsitems):
    """Load the news items into a DataFrame and strip markup from ``content``.

    Args:
        newsitems: list of dicts as produced by ``parseXML``.

    Returns:
        A ``pandas.DataFrame`` with the columns ID, headline, imagename,
        content, caption, summary and cats. Anything of the form ``<...>`` is
        removed from string values of ``content``; missing content values are
        left untouched.
    """
    # defining data frame columns
    df_cols = ['ID', 'headline', 'imagename', 'content', 'caption', 'summary', 'cats']
    # making data frame
    out_df = pd.DataFrame(newsitems, columns=df_cols)

    # Non-greedy match of a single tag such as <div class="x"> or <br/>.
    tag_pattern = re.compile(r'<.*?>')

    # Only strings are cleaned: an article without content arrives as
    # None/NaN, and passing that to the regex would raise TypeError.
    out_df['content'] = out_df['content'].apply(
        lambda cw: tag_pattern.sub('', cw) if isinstance(cw, str) else cw
    )
    # returning data frame
    return out_df

def define_filename(filename):
    """Return the output path ``<default_path>/<filename>.xml`` for a news item.

    ``default_path`` is read from ``path.ini`` in the current working
    directory; the first section that defines the option wins. When the file
    or the option is missing, the path is relative to the current directory.

    Args:
        filename: base name of the file, without extension.

    Returns:
        The joined path as a string.
    """
    config = configparser.ConfigParser()
    # read() silently skips a missing file, leaving the parser empty.
    config.read('path.ini')

    file_format = "xml"
    # Fallback: an empty directory part makes os.path.join return just the
    # file name, i.e. a path relative to the current working directory.
    default_path = ""
    # Every section is searched; returning from inside the loop would only
    # ever look at the first one and fail when it lacks the option.
    for section_name in config.sections():
        if config.has_option(section_name, 'default_path'):
            default_path = config.get(section_name, 'default_path')
            break
    return os.path.join(default_path, filename + "." + file_format)

def build_item_xml(row):
    """Build a NITF document for one news row and write it to disk.

    The document has a ``head`` (``title``, ``cats`` and an ``iim`` block of
    ``ds`` records) and a ``body`` whose ``body.content`` holds the article
    text. It is written as UTF-8 with an XML declaration to the path returned
    by ``define_filename(row['ID'])``.

    Args:
        row: mapping (e.g. a DataFrame row) providing ``ID``, ``headline``,
            ``cats`` and ``content``.

    Returns:
        ``row`` unchanged, so the function can be used with ``DataFrame.apply``.
    """
    # defining new xml as per CCI structure
    nitf = ET.Element('nitf')

    # head with title and category
    head = ET.SubElement(nitf, 'head')
    ET.SubElement(head, 'title').text = row["headline"]
    ET.SubElement(head, 'cats').text = row["cats"]

    iim = ET.SubElement(head, 'iim', ver='3')
    # (num, value) pairs of the <ds> records. Record 1:30 is the one the
    # original author marked as important for importing into CCI. Records
    # 1:80 and 2:10 are not emitted (they were commented out in the original).
    ds_records = (
        ("1:20", "79"),
        ("1:30", "TOIOnline"),
        ("2:20", "TOIOnline"),
        ("2:25", row["headline"]),
        ("2:105", row["headline"]),
    )
    for num, value in ds_records:
        # "num" is a real attribute of a <ds> element. Embedding it in the tag
        # name only serialized correctly by accident and left elements in the
        # tree that could not be found by the tag "ds".
        ET.SubElement(iim, 'ds', num=num, value=value)

    # saving content in body of xml
    body = ET.SubElement(nitf, 'body')
    ET.SubElement(body, 'body.content').text = row["content"]

    # writing the xml file
    ET.ElementTree(nitf).write(
        define_filename(row['ID']), encoding='utf-8', xml_declaration=True
    )
    # returning in form of row
    return row

def main():
    """Fetch the feed, parse it, and write one NITF XML file per news item."""
    # load rss from web
    lodrss = loadRSS()
    if lodrss is None:
        # loadRSS() has already reported the connection failure; there is
        # nothing to parse, and ET.parse(None) would only raise.
        return

    # parse the feed, closing the HTTP response once it has been read
    with lodrss:
        newsitems = parseXML(lodrss)

    # store news items in a data frame
    df = savetodf(newsitems)
    # this calls build_item_xml per row
    df.apply(build_item_xml, axis=1)
         
# Entry point: run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
#     pd.set_option('display.max_colwidth', -1)
    # calling main function
    main()

1. XML 中的 content（文本）含有无用的标签，如何删除这些标签？示例：

更好的方法是先把输入的 feed 保存到数据框（DataFrame）中，然后再删除其中的标签：

# remove unwanted tags from content
    out_df['content']=out_df['content'].apply(lambda cw :re.sub('<.*?>','',cw))

2. 如何按照我要求的示例格式更改标签：

您需要根据数据框中的每一行创建一个新的 XML，并分别保存为不同的 XML 文件，文件名为该行的 ID：

def build_item_xml(row):
    """Build a NITF document for one news row and write it to disk.

    The document has a ``head`` (``title``, ``cats`` and an ``iim`` block of
    ``ds`` records) and a ``body`` whose ``body.content`` holds the article
    text. It is written as UTF-8 with an XML declaration to the path returned
    by ``define_filename(row['ID'])``.

    Args:
        row: mapping (e.g. a DataFrame row) providing ``ID``, ``headline``,
            ``cats`` and ``content``.

    Returns:
        ``row`` unchanged, so the function can be used with ``DataFrame.apply``.
    """
    # defining new xml as per CCI structure
    nitf = ET.Element('nitf')

    # head with title and category
    head = ET.SubElement(nitf, 'head')
    ET.SubElement(head, 'title').text = row["headline"]
    ET.SubElement(head, 'cats').text = row["cats"]

    iim = ET.SubElement(head, 'iim', ver='3')
    # (num, value) pairs of the <ds> records. Record 1:30 is the one the
    # original author marked as important for importing into CCI. Records
    # 1:80 and 2:10 are not emitted (they were commented out in the original).
    ds_records = (
        ("1:20", "79"),
        ("1:30", "TOIOnline"),
        ("2:20", "TOIOnline"),
        ("2:25", row["headline"]),
        ("2:105", row["headline"]),
    )
    for num, value in ds_records:
        # "num" is a real attribute of a <ds> element. Embedding it in the tag
        # name only serialized correctly by accident and left elements in the
        # tree that could not be found by the tag "ds".
        ET.SubElement(iim, 'ds', num=num, value=value)

    # saving content in body of xml
    body = ET.SubElement(nitf, 'body')
    ET.SubElement(body, 'body.content').text = row["content"]

    # writing the xml file
    ET.ElementTree(nitf).write(
        define_filename(row['ID']), encoding='utf-8', xml_declaration=True
    )
    # returning in form of row
    return row

Python 从 xml 中删除标签 br 和其他标签

Python remove tag br and other tags from xml

rss

elementtree

xml-parsing

python-3.x

用于演示如何解析 XML 文件的 Python 代码