Unable to rectify - ValueError: unknown url type: Link
I am currently running this code to scrape article URL links into a csv file, and then visit those URLs (from the csv file) to scrape the corresponding information into a text file.
I am able to scrape the links into the csv file, but I cannot read the csv file back to scrape the further information (the text file never gets created), and I run into a ValueError.
import csv
from lxml import html
from time import sleep
import requests
from bs4 import BeautifulSoup
import urllib
import urllib2
from random import randint

outputFile = open("All_links.csv", r'wb')
fileWriter = csv.writer(outputFile)
fileWriter.writerow(["Link"])
#fileWriter.writerow(["Sl. No.", "Page Number", "Link"])

url1 = 'https://www.marketingweek.com/page/'
url2 = '/?s=big+data'
sl_no = 1

#iterating from 1st page through 361th page
for i in xrange(1, 361):
    #generating final url to be scraped using page number
    url = url1 + str(i) + url2
    #Fetching page
    response = requests.get(url)
    sleep(randint(10, 20))
    #using html parser
    htmlContent = html.fromstring(response.content)
    #Capturing all 'a' tags under h2 tag with class 'hentry-title entry-title'
    page_links = htmlContent.xpath('//div[@class = "archive-constraint"]//h2[@class = "hentry-title entry-title"]/a/@href')
    for page_link in page_links:
        print page_link
        fileWriter.writerow([page_link])
        sl_no += 1

with open('All_links.csv', 'rb') as f1:
    f1.seek(0)
    reader = csv.reader(f1)
    for line in reader:
        url = line[0]
        soup = BeautifulSoup(urllib2.urlopen(url))
        with open('LinksOutput.txt', 'a+') as f2:
            for tag in soup.find_all('p'):
                f2.write(tag.text.encode('utf-8') + '\n')
This is the error I am getting:
File "c:\users\rrj17\documents\visual studio 2015\Projects\webscrape\webscrape\webscrape.py", line 47, in <module>
soup = BeautifulSoup(urllib2.urlopen(url))
File "C:\Python27\lib\urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 421, in open
protocol = req.get_type()
File "C:\Python27\lib\urllib2.py", line 283, in get_type
raise ValueError, "unknown url type: %s" % self.__original
ValueError: unknown url type: Link
Any help on this would be appreciated.
Try skipping the first row of the csv file... you are unknowingly trying to parse the header. Note that "Link", the value your traceback complains about, is exactly the header you wrote with fileWriter.writerow(["Link"]), and it is not a valid URL.
with open('All_links.csv', 'rb') as f1:
    reader = csv.reader(f1)
    next(reader)  # read the header and send it to oblivion
    for line in reader:  # NOW start reading
        ...
You also don't need f1.seek(0), since f1 automatically points to the beginning of the file when opened in read mode.
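
Putting that together, the reading half of the script could look something like the minimal sketch below (Python 2, matching the question; the startswith('http') guard and the blank-row check are extra precautions I'm adding, not part of the original code):

import csv
import urllib2
from bs4 import BeautifulSoup

# Assumes the writing phase has finished and outputFile has been closed
# (or flushed), so All_links.csv is complete on disk before we read it.
with open('All_links.csv', 'rb') as f1:
    reader = csv.reader(f1)
    next(reader)  # discard the "Link" header row that triggered the ValueError
    with open('LinksOutput.txt', 'a+') as f2:
        for line in reader:
            if not line:
                continue  # skip any blank rows
            url = line[0]
            # defensive guard (my addition): only fetch well-formed http(s) URLs
            if not url.startswith('http'):
                continue
            soup = BeautifulSoup(urllib2.urlopen(url))
            for tag in soup.find_all('p'):
                f2.write(tag.text.encode('utf-8') + '\n')

Opening LinksOutput.txt once outside the loop, rather than once per row as in the original, avoids repeatedly reopening the same file; the scraping behaviour is otherwise unchanged.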