在原始代码 python、selenium、re 中替换 url 时出现错误

I am getting error while replacing url in orginal code python,selenium,re

我在这个问题中使用了代码https://codereview.stackexchange.com/questions/241842/webscraping-with-selenium-a-course-downloader-and-sorter/248712#248712

我将该代码中的 url 替换为我的 url 当我编译时得到如下所示的错误

第 66 行,在 current_file_name = re.search(r'https://player.hdflixcore.workers.dev//0://课程//账号%20Cracking%20--MrSihag//TN%20Cracking%20课程%20- -MrSihag/.+/(.+)', download_path, re.DOTALL).group(1) AttributeError: 'NoneType' 对象没有属性 'group'

我想我在代码中使用了网站地址 在“current_file_name”中有一些额外的字母,如反斜杠
我不知道 我试着通过添加一些反斜杠来做同样的事情,但没有修复

但是当我 运行 原始代码时它工作正常 当我在我想要的网站上使用它时,它最终会出现上面提到的错误

下面是我编辑的代码


from selenium import webdriver
import time
import os
import shutil
import re

path = r'https://player.hdflixcore.workers.dev/0:/Courses/Account%20Cracking%20--MrSihag/TN%20Cracking%20Course%20--MrSihag/'

# For changing the download location for this browser temporarily
options = webdriver.ChromeOptions()
preferences = {"download.default_directory": r"C:\Users\shanid\Desktop\test", "safebrowsing.enabled": "false"}
options.add_experimental_option("prefs", preferences)

# Acquire the Course Link and Get all the directories
browser = webdriver.Chrome(chrome_options=options)
browser.get(r"https://player.hdflixcore.workers.dev/0:/Courses/Account%20Cracking%20--MrSihag/TN%20Cracking%20Course%20--MrSihag/")
time.sleep(2)
elements = browser.find_elements_by_css_selector(".mdui-text-truncate")

# loop for as many directories there are
for i in range(0, len(elements)):
    print("deft")

    # At each directory, it refreshes the page to update the webelements in the list, and returns the current directory that is being worked on
    browser.get(path)
    time.sleep(2)
    elements = browser.find_elements_by_css_selector(".mdui-text-truncate")
    element = elements[i]

    # checks if the folder for the directory already exists
    current_directory_name = element.text[11:].strip(" .")
    current_folder_path = "C:\Users\shanid\Desktop\test\" + current_directory_name
    if os.path.exists(current_folder_path):
        pass
    else:
        os.mkdir(current_folder_path)

    # Formatting what has been downloaded and sorted, and
    print(current_directory_name, "------------------------------", sep="\n")

    # moves on to the directory to get the page with the files
    element.click()

    # pausing for a few secs for the page to load, and running the same mechanism to get each file using the same method used in directory
    time.sleep(3)
    files = browser.find_elements_by_css_selector(".mdui-text-truncate")
    for j in range(len(files)):
        files = browser.find_elements_by_css_selector(".mdui-text-truncate")
        _file = files[j]
    # constants for some if statements
        download = True
        move = True
        current_file_name = _file.text[17:].strip()

    # If file exists, then pass over it, and don't do anything, and moveon to next file
        if os.path.exists(current_folder_path + "\" + current_file_name):
            pass

    # If it doesnt exist, then depending on its extension, do specific actions with it
        else:
            # Downloads the mp4 files by clicking on it, and finding the input tag which contains the download link for vid in its value attribute
            if ".mp4" in current_file_name:
                _file.click()
                time.sleep(2)
                download_path = browser.find_element_by_css_selector("input").get_attribute("value")
                current_file_name = re.search(r'https://player.hdflixcore.workers.dev//0://Courses//Account%20Cracking%20--MrSihag//TN%20Cracking%20Course%20--MrSihag/.+/(.+)', download_path, re.DOTALL).group(1)
                # Checks if file exists again, incase the filename is different then the predicted filename orderly generated.
                if os.path.exists(current_folder_path + "\" + current_file_name):
                    move = False
                    download = False
                # returns to the previous page with the files
                browser.back()

            # self explanatory
            elif ".html" in current_file_name:
                download_path = path + current_directory_name + "/" + current_file_name
                if os.path.exists(current_folder_path + "\" + current_file_name):
                    move = False
                    download = False

            else:
            # acquires the download location by going to the parent tag which is an a tag containing the link for html in its 'href' attribute
                download_path = _file.find_element_by_xpath('..').get_attribute('href').replace(r"%5E", "^")
                current_file_name = re.search(r'https://player.hdflixcore.workers.dev/0:/Courses/Account%20Cracking%20--MrSihag/TN%20Cracking%20Course%20--MrSihag/.+/(.+)', download_path, re.DOTALL).group(1).replace("%20", " ")

            time.sleep(2)
            current_file_path = "C:\Users\shanid\Desktop\test\" + current_file_name
            # responsible for downloading it using a path, get allows downloading, by source links
            if download:
                browser.get(download_path)

                # while the file doesn't exist/ it hasn't been downloaded yet, do nothing
                while True:
                    if os.path.exists(current_file_path):
                        break
                time.sleep(1)

            # moves the file from the download spot to its own folder
            if move:
                shutil.move(current_file_path, current_folder_path + "\" + current_file_name)
        print(current_file_name)

    # formatter
    print("------------------------------", "", sep="\n")
    time.sleep(3) 

下面是原代码


from selenium import webdriver
import time
import os
import shutil
import re

path = r'https://coursevania.courses.workers.dev/[coursevania.com]%20Udemy%20-%20Master%20the%20Coding%20Interview%20Data%20Structures%20+%20Algorithms/'

# For changing the download location for this browser temporarily
options = webdriver.ChromeOptions()
preferences = {"download.default_directory": r"E:\Utilities_and_Apps\Python\MY PROJECTS\Test data\Downloads", "safebrowsing.enabled": "false"}
options.add_experimental_option("prefs", preferences)

# Acquire the Course Link and Get all the directories
browser = webdriver.Chrome(chrome_options=options)
browser.get(r"https://coursevania.courses.workers.dev/[coursevania.com]%20Udemy%20-%20Master%20the%20Coding%20Interview%20Data%20Structures%20+%20Algorithms/")
time.sleep(2)
elements = browser.find_elements_by_css_selector(".mdui-text-truncate")

# loop for as many directories there are
for i in range(0, len(elements)):

    # At each directory, it refreshes the page to update the webelements in the list, and returns the current directory that is being worked on
    browser.get(path)
    time.sleep(2)
    elements = browser.find_elements_by_css_selector(".mdui-text-truncate")
    element = elements[i]

    # checks if the folder for the directory already exists
    current_directory_name = element.text[11:].strip(" .")
    current_folder_path = "E:\Utilities_and_Apps\Python\MY PROJECTS\Test data\Downloads\" + current_directory_name
    if os.path.exists(current_folder_path):
        pass
    else:
        os.mkdir(current_folder_path)

    # Formatting what has been downloaded and sorted, and 
    print(current_directory_name, "------------------------------", sep="\n")

    # moves on to the directory to get the page with the files
    element.click()

    # pausing for a few secs for the page to load, and running the same mechanism to get each file using the same method used in directory 
    time.sleep(3)
    files = browser.find_elements_by_css_selector(".mdui-text-truncate")
    for j in range(len(files)):
        files = browser.find_elements_by_css_selector(".mdui-text-truncate")
        _file = files[j]
    # constants for some if statements
        download = True
        move = True
        current_file_name = _file.text[17:].strip()

    # If file exists, then pass over it, and don't do anything, and moveon to next file
        if os.path.exists(current_folder_path + "\" + current_file_name):
            pass

    # If it doesnt exist, then depending on its extension, do specific actions with it 
        else:
            # Downloads the mp4 files by clicking on it, and finding the input tag which contains the download link for vid in its value attribute
            if ".mp4" in current_file_name:
                _file.click()
                time.sleep(2)  
                download_path = browser.find_element_by_css_selector("input").get_attribute("value")
                current_file_name = re.search(r'https://coursevania.courses.workers.dev/\[coursevania.com\]%20Udemy%20-%20Master%20the%20Coding%20Interview%20Data%20Structures%20\+%20Algorithms/.+/(.+)', download_path, re.DOTALL).group(1)
                # Checks if file exists again, incase the filename is different then the predicted filename orderly generated.
                if os.path.exists(current_folder_path + "\" + current_file_name):
                    move = False
                    download = False
                # returns to the previous page with the files
                browser.back()

            # self explanatory
            elif ".html" in current_file_name:
                download_path = path + current_directory_name + "/" + current_file_name
                if os.path.exists(current_folder_path + "\" + current_file_name):
                    move = False
                    download = False

            else:
            # acquires the download location by going to the parent tag which is an a tag containing the link for html in its 'href' attribute
                download_path = _file.find_element_by_xpath('..').get_attribute('href').replace(r"%5E", "^")
                current_file_name = re.search(r'https://coursevania.courses.workers.dev/\[coursevania.com\]%20Udemy%20-%20Master%20the%20Coding%20Interview%20Data%20Structures%20\+%20Algorithms/.+/(.+)', download_path, re.DOTALL).group(1).replace("%20", " ")

            time.sleep(2)
            current_file_path = "E:\Utilities_and_Apps\Python\MY PROJECTS\Test data\Downloads\" + current_file_name
            # responsible for downloading it using a path, get allows downloading, by source links
            if download:
                browser.get(download_path)

                # while the file doesn't exist/ it hasn't been downloaded yet, do nothing
                while True:
                    if os.path.exists(current_file_path):
                        break
                time.sleep(1)

            # moves the file from the download spot to its own folder
            if move:
                shutil.move(current_file_path, current_folder_path + "\" + current_file_name)
        print(current_file_name)

    # formatter
    print("------------------------------", "", sep="\n")
    time.sleep(3)

这段代码工作正常

但是当我将网站更改为 https://player.hdflixcore.workers.dev/0:/Courses/Account%20Cracking%20--MrSihag/TN%20Cracking%20Course%20--MrSihag/

时无法正常工作

我使用的网站是原始网站的克隆

我不知道为什么会出错

问题出在下页输入框的 CSS 选择器上。 https://player.hdflixcore.workers.dev/0:/Courses/Account%20Cracking%20--MrSihag/TN%20Cracking%20Course%20--MrSihag/01%20Course%20Introduction/1%20Course%20Introduction.mp4?a=view

页面上有2个输入框,所以CSS路径要写成"#content > div > div:nth-child(6) > input".

代码有问题。

download_path = browser.find_element_by_css_selector("input").get_attribute("value")

将替换为。

download_path = browser.find_element_by_css_selector("#content > div > div:nth-child(6) > input").get_attribute("value")