如何从文件中删除重复行并将它们写入 Python 中的文件

How to remove duplicate lines from a file and write them to a file in Python

下面的代码获取 HTML 文件中的所有链接并将其写入文本文件,但它也会把重复的行(链接)写进去。有什么方法可以确保它不会写入文件中已经存在的链接吗?最好是现成的方法,这样我就不必手动编写去重功能的代码了。

from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Print every ``href`` found on ``<a>`` start tags and append it to a file.

    Each link is appended to ``index_link.txt`` on its own line. No duplicate
    detection is performed: a link that occurs several times in the document
    is printed and written once per occurrence.
    """

    def handle_starttag(self, tag, attrs):
        """Handle one start tag.

        Args:
            tag: Name of the tag that was opened.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a":
            return
        for name, value in attrs:
            # A valueless attribute (``<a href>``) is reported with value
            # None, which cannot be concatenated with "\n"; skip it.
            if name == "href" and value is not None:
                print(value)
                # The context manager closes the handle after every write
                # instead of leaking one open file per link.
                with open("index_link.txt", "a") as f:
                    f.write(value + "\n")

def main():
    """Parse ``index.html`` for links and copy its contents to ``textfile.html``.

    The whole of ``index.html`` is read and fed to a ``MyHTMLParser``; the
    same text is then written to ``textfile.html``. If ``index.html`` cannot
    be opened, "No file found" is printed and nothing else happens.
    """
    parser = MyHTMLParser()
    # open() raises when the file is missing, so inspecting ``f.mode`` can
    # never detect that case; catch the error instead.
    try:
        with open("index.html") as f:
            contents = f.read()
    except IOError:
        print("No file found")
        # Without contents there is nothing to parse or to copy.
        return
    parser.feed(contents)
    with open("textfile.html", "w") as f:
        f.write(contents)


# Run main() only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

使用set()。与其将链接直接写入文件(无论如何效率很低),不如试试这个:

class MyHTMLParser(HTMLParser):
    """Collect the distinct ``href`` values of all ``<a>`` start tags.

    Attributes are gathered in ``self.my_links``, a set, so a link that
    appears several times in the document is stored only once.
    """

    def __init__(self):
        # Call the base initializer explicitly: this works whether or not
        # HTMLParser is a new-style class, unlike super().
        HTMLParser.__init__(self)
        self.my_links = set()

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` of an ``<a>`` tag.

        Args:
            tag: Name of the tag that was opened.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a" or not attrs:
            return None

        for name, value in attrs:
            # A valueless attribute (``<a href>``) has value None; it is not
            # a usable link, so it is left out of the set.
            if name == "href" and value is not None:
                self.my_links.add(value)

然后检索链接:

# Create the parser defined above (the class is MyHTMLParser).
parser = MyHTMLParser()
# ... do your parsing here
links = parser.my_links
# Write one link per line. The set already holds each link only once; note
# that a set has no defined order, so the line order is arbitrary.
with open('path/to/file', 'w') as f:
    for link in links:
        f.write(link + "\n")

您需要自己记录已经找到的链接,例如使用 set:

class MyHTMLParser(HTMLParser):
    """Print and save each distinct ``href`` of ``<a>`` tags as it is found.

    Links already seen are remembered in ``self.links_found`` (a set); only
    the first occurrence of a link is printed and appended to
    ``index_link.txt``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.links_found = set()

    def handle_starttag(self, tag, attrs):
        """Handle one start tag.

        Args:
            tag: Name of the tag that was opened.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a" or not attrs:
            return
        for name, value in attrs:
            # A valueless attribute (``<a href>``) has value None, which
            # cannot be concatenated with "\n"; ignore it.
            if name != "href" or value is None:
                continue
            if value in self.links_found:
                continue
            self.links_found.add(value)
            print(value)
            with open("index_link.txt", "a+") as f:
                f.write(value + "\n")

你也可以使用一个简单的列表,如果你想按顺序保留链接,而不是直接将它们写入文件:

from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Collect the ``href`` of ``<a>`` tags in first-seen order, without repeats.

    The links are kept in the list ``self.found_links``; a link is appended
    only if it is not already in the list.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.found_links = []

    def handle_starttag(self, tag, attrs):
        """Append the tag's ``href`` to ``found_links`` if it is new.

        Args:
            tag: Name of the tag that was opened.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a":
            return
        # Turn the pair list into a mapping for direct lookup by name.
        attributes = dict(attrs)
        if "href" not in attributes:
            return
        target = attributes["href"]
        if target in self.found_links:
            return
        self.found_links.append(target)

def main():
    """Extract links from ``index.html`` and write the output files.

    Reads ``index.html``, feeds it to a ``MyHTMLParser``, writes the
    collected links one per line to ``index_link.txt``, and writes the
    original text to ``textfile.html``.
    """
    parser = MyHTMLParser()

    with open("index.html") as source:
        contents = source.read()
    parser.feed(contents)

    # Links are joined with newlines and terminated with a final newline.
    link_text = '\n'.join(parser.found_links) + '\n'
    with open("index_link.txt", "w") as link_file:
        link_file.write(link_text)

    with open("textfile.html", "w") as copy_file:
        copy_file.write(contents)


# Run main() only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

使用列表数据类型就可以很简单地实现:用一个列表保存已找到的链接,这里我使用 html_links 变量:

from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Print and save each distinct ``href`` of ``<a>`` tags, keeping order.

    Links already seen are remembered in the list ``self.html_links``; only
    the first occurrence of a link is printed and appended to
    ``index_link.txt``.
    """

    def __init__(self):
        # Actually invoke the base initializer; the explicit form works
        # whether or not HTMLParser is a new-style class.
        HTMLParser.__init__(self)
        self.html_links = []

    def handle_starttag(self, tag, attrs):
        """Handle one start tag.

        Args:
            tag: Name of the tag that was opened.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a" or not attrs:
            return
        for name, value in attrs:
            # A valueless attribute (``<a href>``) has value None, which
            # cannot be concatenated with "\n"; ignore it.
            if name != "href" or value is None:
                continue
            if value in self.html_links:
                continue
            print(value)
            self.html_links.append(value)
            # The context manager closes the handle after every write
            # instead of leaking one open file per link.
            with open("index_link.txt", "a") as f:
                f.write(value + "\n")

def main():
    """Parse ``index.html`` for links and copy its contents to ``textfile.html``.

    The whole of ``index.html`` is read and fed to a ``MyHTMLParser``; the
    same text is then written to ``textfile.html``. If ``index.html`` cannot
    be opened, "No file found" is printed and nothing else happens.
    """
    parser = MyHTMLParser()
    # open() raises when the file is missing, so inspecting ``f.mode`` can
    # never detect that case; catch the error instead.
    try:
        with open("index.html") as f:
            contents = f.read()
    except IOError:
        print("No file found")
        # Without contents there is nothing to parse or to copy.
        return
    parser.feed(contents)
    with open("textfile.html", "w") as f:
        f.write(contents)


# Run main() only when executed as a script, not when imported.
if __name__ == "__main__":
    main()