Issue with web crawler: IndexError: string index out of range
I'm building a web crawler. I'm not using Scrapy or anything like that; I'm trying to have my script do most of the work itself. I've tried searching for this problem, but I can't seem to find anything that helps with the error. I've also tried swapping some variables around to narrow it down. I get an error on line 24 saying IndexError: string index out of range. The function runs on the first URL (the original one), then the second, and fails on the third one in the original array. I'm lost, and any help would be greatly appreciated! Note that I'm only printing everything out for testing; eventually I'll write the results to a text file.
import requests
from bs4 import BeautifulSoup

# creating requests from user input
url = raw_input("Please enter a domain to crawl, without the 'http://www' part : ")

def makeRequest(url):
    r = requests.get('http://' + url)
    # Adding in BS4 for finding a tags in HTML
    soup = BeautifulSoup(r.content, 'html.parser')
    # Collects every <a> tag found in the page
    output = soup.find_all('a')
    return output

def makeFilter(link):
    # Creating array for our links
    found_link = []
    for a in link:
        a = a.get('href')
        a_string = str(a)
        # if statements to filter our links
        if a_string[0] == '/':  # this is the line with the error
            # Relative links
            found_link.append(a_string)
        if 'http://' + url in a_string:
            # Links from the same site
            found_link.append(a_string)
        if 'https://' + url in a_string:
            # Links from the same site with SSL
            found_link.append(a_string)
        if 'http://www.' + url in a_string:
            # Links from the same site
            found_link.append(a_string)
        if 'https://www.' + url in a_string:
            # Links from the same site with SSL
            found_link.append(a_string)
        #else:
        #    found_link.write(a_string + '\n')  # testing only
    output = found_link
    return output

# Function for removing duplicates
def remove_duplicates(values):
    output = []
    seen = set()
    for value in values:
        if value not in seen:
            output.append(value)
            seen.add(value)
    return output

# Run the functions in this order -> make the request -> filter the links -> remove duplicates
def createURLList(values):
    requests = makeRequest(values)
    new_list = makeFilter(requests)
    filtered_list = remove_duplicates(new_list)
    return filtered_list

result = createURLList(url)
# print result

# for verifying and crawling resulting pages
for b in result:
    sub_directories = createURLList(url + b)
    crawler = []
    crawler.append(sub_directories)
    print crawler
The error happens when an <a> tag has an empty href: a_string ends up as an empty string, so indexing it with a_string[0] is out of range. Try adding this right after a_string = str(a):

if not a_string:
    continue
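
For context, here is a minimal sketch of how makeFilter might look with that guard in place. It reuses the same global url and names as your code; the only changes are the empty-string check and merging the duplicated if branches:

def makeFilter(link):
    # Collect the links we want to keep
    found_link = []
    for a in link:
        a = a.get('href')
        a_string = str(a)
        # Skip anchors whose href is empty, so a_string[0]
        # never indexes into an empty string
        if not a_string:
            continue
        if a_string[0] == '/':
            # Relative links
            found_link.append(a_string)
        if 'http://' + url in a_string or 'https://' + url in a_string:
            # Absolute links to the same site (with or without SSL)
            found_link.append(a_string)
        if 'http://www.' + url in a_string or 'https://www.' + url in a_string:
            # Same site with the www prefix
            found_link.append(a_string)
    return found_link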