Python 3.x 使用编码解码时不支持的操作数类型

Python 3.x unsupported operand type in using encode decode

我正在尝试为我的营销项目构建一个通用爬虫,用来跟踪信息的来源(即博客、客户评价等)。我使用 Python 3.5,并以 Spyder/PyCharm 作为 IDE,但在使用 encode/decode 时不断收到以下错误。我的代码的输入是 Excel 文件中的公司名称和产品功能列表。我也搜索过可能的解决方案,但社区中的建议都是关于类型转换的,我不确定问题是否出在这里。如果需要我提供更多说明,请告诉我。

from __future__ import division, unicode_literals 
import codecs
import re
import os
import xlrd
import requests
from urllib.request import urlopen
from time import sleep
from bs4 import BeautifulSoup
import openpyxl
from collections import Counter

page=0  # per-feature tallies, reset after each feature row is written: pages where the word matched
b=0  # matched links whose markup contains 'customer-testimonial'
n=0  # matched links whose markup contains 'case-study'
w=0  # matched links whose markup contains 'blog'
p=0  # matched links whose markup contains 'press'
o=0  # matched links in none of the categories above
workbook=xlrd.open_workbook("C:\Product.xlsx")  # feature words; NOTE(review): "\P" is not a valid escape, a raw string would be safer
workbook1=xlrd.open_workbook("C:\linkslist.xlsx")  # company names; NOTE(review): same concern for "\l"
sheet_names = workbook.sheet_names()
sheet_names1 = workbook1.sheet_names()
wb= openpyxl.Workbook() #User Spreadsheet
ws = wb.active
ws.title = "User"
ws['A1'] = 'Feature'
ws['B1'] = 'Customer-Testimonials'
ws['C1'] = 'Case Study'
ws['D1'] = 'Blog'
ws['E1'] = 'Press'
ws['F1'] = 'Total posts'
ws1 = wb.create_sheet(title="Ml")  # second output sheet: phrase, address and tag counts per feature
ws1['A1'] = 'Feature'
ws1['B1'] = 'Phrase'
ws1['C1'] = 'Address'
ws1['D1'] = 'Tag Count'
worksheet = workbook.sheet_by_name(sheet_names[0])
worksheet1 = workbook1.sheet_by_name(sheet_names[0])  # NOTE(review): uses the first sheet name of `workbook`, not sheet_names1 - confirm intended
for linknumber in range(0,25):  # rows 0..24 of the links sheet, one company per row
    u = worksheet1.cell(linknumber,0).value
    url='www.' + u.lower() + '.com'  # home page is assumed to be www.<name>.com
    print (url)
    r=''
    while r == '':  # retry until requests.get returns; never exits if it keeps failing
        try:
            print ("in loop")
            r  = requests.get("http://" +url)
        except:  # bare except: every error, not only network errors, triggers a retry
            sleep(3)#if the code still gives that error then try increasing the sleep time to 5 maybe
    print (r)
    data = r.text
    #print data
    soup1 = BeautifulSoup(data, "html.parser")
    #print soup1
    num=3 #starting row number and keep the column same.
    word = ''
    word = worksheet.cell(num,3).value  # feature word from row index num, column index 3
    # Handle one feature word per pass until a cell holding the sentinel 'end' is read.
    while not word == 'end':
        print (num)
        #print word
        tag_list=[]
        phrase= []
        counts=[]
        address=[]        
        counts = Counter(tag_list)  # empty here; rebuilt after each matching link
        for link in soup1.find_all('a'):
            #print link
            add = link.encode("ascii", "ignore")  # line 69 of the traceback: link is a bs4 Tag, so "ignore" is taken as indent_level
            print (add) 
            if not'Log In' in add:  # NOTE(review): str-in-bytes test, assuming Tag.encode returns bytes - confirm
                #print link.get('href')
                i=0
                content = ''
                for i in range(1,5):  # at most four fetch attempts per link
                    if content=='':
                        try:
                            print (link.get('href'))
                            i+=1  # no lasting effect: the for loop rebinds i on the next pass
                            req = urllib.request.Request(link.get('href'))  # NOTE(review): only urlopen is imported, so `urllib` looks unbound here - confirm
                            with urllib.request.urlopen(req) as response:
                                content = response.read()    
                        except:  # bare except: would also hide a NameError raised in the try body
                            sleep(3)
                            #if the code still gives that error then try increasing the sleep time to 5 maybe
                            continue
                soup = BeautifulSoup(content, "html.parser")  # content is still '' when every attempt failed
                s=soup(text=re.compile(word))  # word is used as a regular-expression pattern
                if s:
                    print ("TRUE")
                    add = link.encode('ascii','ignore')  # same call shape as the failing line above
                    print (type(add))
                    if 'customer-testimonial' in add :
                        b+=1
                    elif 'case-study' in add :
                        n+=1
                    elif 'blog' in add :
                        w+=1  
                    elif 'press' in add :
                        p+=1
                    else :
                        o+=1
                    #phrase_type=["Customer testimonials","news","ads","twitter","facebook","instagram"]
                    #print(os.path.join(root, name))
                    print (add)
                    for tag in s:
                        parent_html = tag.parent.name  # name of the element that contains the match
                        print (parent_html)
                        tag_list.append(parent_html)
                    phrase.append(s)  # appends the whole result set as one list element
                    address.append(add)
                    #print str(phrase)
                    counts = Counter(tag_list)
                    page +=1
                else:
                    counts = Counter(tag_list)
        no =num-1  # output row: num starts at 3, so the first feature goes to row 2, below the headers
        print(counts)
        print (word)
        ws['A%d'%no] = word.encode('utf-8' , 'ignore')  # NOTE(review): if word is a str this stores bytes in the cell - confirm
        ws1['A%d'%no] = word.encode('utf-8' , 'ignore')
        print ("Number of pages is %d" %page)
        print ("Number of Customer testimonials posts is %d" %b)
        ws['B%d'%no] = b
        print ("Number of Case Studies posts is %d" %n)
        ws['C%d'%no] = n
        print ("Number of blog posts is %d" %w)
        ws['D%d'%no] = w
        print ("Number of press posts is %d" %p)
        ws['E%d'%no] = p
        print ("Number of posts is %d" %page)
        ws['F%d'%no] = page
        ws1['B%d'%no] = phrase.encode('utf-8' , 'ignore')  # NOTE(review): phrase is a list; lists have no .encode()
        ws1['C%d'%no] = address.encode('utf-8' , 'ignore')  # NOTE(review): address is a list; lists have no .encode()
        ws1['D%d'%no] = counts.encode('utf-8' , 'ignore')  # NOTE(review): counts is a Counter; it has no .encode()
        counts.clear()
        num += 1
        word = worksheet.cell(num,3).value  # next feature word, or the 'end' sentinel
        #print word
        page=0
        b=0
        n=0
        w=0
        p=0
        o=0
        phrase=[]
        address=[]
        tag_list=[]
wb.save('%s.xlsx'%worksheet1.cell(linknumber,0).value)  # outside the for loop: runs once, named after the last company row

我在运行代码时得到以下输出和错误:

www.amobee.com
in loop
<Response [200]>
3
Traceback (most recent call last):
  File "C:/project_web_parser.py", line 69, in <module>
    add = link.encode("ascii", "ignore")
  File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1094, in encode
    u = self.decode(indent_level, encoding, formatter)
  File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1159, in decode
    indent_space = (' ' * (indent_level - 1))
TypeError: unsupported operand type(s) for -: 'str' and 'int'

Process finished with exit code 1

Traceback 在您尝试编码 link 的第 69 行显示错误。要修复它,只需将该行更改为:

add = link.encode("ascii", errors="ignore")  # pass errors by keyword so "ignore" is not bound to indent_level

为什么会这样?

您的 link 变量的类型是 bs4.element.Tag:
>>>type(link)
<class 'bs4.element.Tag'>
Tag 的 .encode() 方法比字符串的 .encode() 方法接受更多的参数。在 bs4 源代码文件 \bs4\element.py 的第 1089 行,您可以找到它的定义:

def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):

第一个参数是 encoding,第二个是 indent_level(int 或 None),而 errors 处理是第四个参数。因此,按位置传入的 "ignore" 被当作了 indent_level。

错误

unsupported operand type(s) for -: 'str' and 'int'

表示代码尝试计算 'ignore' - 1,即用字符串减去整数(对应 traceback 中的 indent_level - 1)。