BeautifulSoup 问题和请求解析
Issue with BeautifulSoup and requests parses
我在尝试使用 BeautifulSoup 和请求模块时遇到错误。
我的代码如下:
import requests
from bs4 import BeautifulSoup
def get_html(url):
url = ('https://m.vk.com/bageto?act=members&offset=0')
r = requests.get(url)
return r.text
def get_total_pages(get_html):
soup = BeautifulSoup(get_html, 'lxml')
pages = soup.find('div', class_='pagination').find_all('a', class_='pg_link')[-1].get('href')
total_pages = pages.split('=')[2]
return int(total_pages)
def main():
base_url = 'https://m.vk.com/bageto?act=members&offset='
total_pages = get_total_pages(get_html)
for i in range(50, total_pages, 50):
print (i)
这会产生错误:
C:\Users\PANDEMIC\Desktop\Python-Test>vkp.py Traceback (most recent call last):
File "C:\Users\PANDEMIC\Desktop\Python-Test\vkp.py",
line 23, in <module>
main()
File "C:\Users\PANDEMIC\Desktop\Python-Test\vkp.py", line 20, in main
total_pages = get_total_pages(get_html)
File "C:\Users\PANDEMIC\Desktop\Python-Test\vkp.py", line 13, in get_total_pages
soup = BeautifulSoup(get_html, 'lxml')
File "C:\Users\PANDEMIC\AppData\Local\Programs\Python\Python36-32\lib\site-packages\bs4\__init__.py", line 192, in __init__
elif len(markup) <= 256 and (
TypeError: object of type 'function' has no len()
您忘记了给 get_html 加上括号和参数——应该调用 get_html(...) 并把它的返回值传进去,而不是把函数对象本身传进去:
total_pages = get_total_pages( get_html(base_url) )
顺便说一句:你不需要 get_html 里那行 url 赋值,因为它会覆盖你传入的参数。
def get_html(url):
    """Fetch *url* and return the response body as text.

    The caller supplies the URL; the old hard-coded assignment that
    clobbered the parameter stays commented out.
    """
    # url = ('https://m.vk.com/bageto?act=members&offset=0')
    r = requests.get(url)
    return r.text
或者您可以使用默认值
def get_html(url='https://m.vk.com/bageto?act=members&offset=0'):
    """Fetch *url* and return the response body as text.

    The first members page is used as the default when no URL is given.
    (The original snippet was missing the ':' after the def header.)
    """
    r = requests.get(url)
    return r.text
完整版:把 base_url+"0" 作为参数传入,即 get_html(base_url+"0"):
import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url* and return the response body as text."""
    # Do not overwrite the url parameter here -- a hard-coded assignment
    # was the original bug: it silently ignored the caller's argument.
    # url = ('https://m.vk.com/bageto?act=members&offset=0')
    r = requests.get(url)
    return r.text


def get_total_pages(html):
    """Parse the members-list HTML and return the last page offset as an int."""
    soup = BeautifulSoup(html, 'lxml')
    # The last pagination link's href ends in '...&offset=N'; N is what we want.
    pages = soup.find('div', class_='pagination').find_all('a', class_='pg_link')[-1].get('href')
    total_pages = pages.split('=')[2]
    return int(total_pages)


def main():
    base_url = 'https://m.vk.com/bageto?act=members&offset='
    # Pass the *result* of get_html(...) -- an HTML string -- not the function itself.
    total_pages = get_total_pages(get_html(base_url + "0"))
    print(total_pages)
    for i in range(50, total_pages, 50):
        print(i)
        # print(base_url + str(i))


main()
import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url* and return the response body as text.

    NOTE(review): the original version re-assigned a hard-coded URL to
    ``url`` here, silently ignoring the caller's argument. The line is
    removed so the parameter is actually honoured; behaviour is unchanged
    because main() passes exactly that URL.
    """
    r = requests.get(url)
    return r.text


def get_total_pages(html):
    """Parse the members-list HTML and return the last page offset as an int."""
    soup = BeautifulSoup(html, 'lxml')
    pages = soup.find('div', class_='pagination').find_all('a', class_='pg_link')[-1].get('href')
    total_pages = pages.split('=')[2]
    return int(total_pages)


def main():
    # base_url already carries offset=0, so no extra suffix is needed.
    base_url = 'https://m.vk.com/bageto?act=members&offset=0'
    html = get_html(base_url)
    total_pages = get_total_pages(html)
    print(total_pages)
您应该将 html 字符串传递给 BeautifulSoup,而不是函数。
def main():
    """Build the list of paginated member URLs and scrape them with a thread pool."""
    # Local import keeps the snippet self-contained; ThreadPool is stdlib.
    from multiprocessing.dummy import Pool as ThreadPool
    try:
        urll = []
        base_url = 'https://m.vk.com/bageto?act=members&offset='
        # The original referenced an undefined name ``url`` here (NameError);
        # the first page is base_url with offset 0.
        total_pages = int(get_total_pages(get_html(base_url + '0')))
        for i in range(0, total_pages, 50):
            url_gen = str(base_url + str(i))
            urll.append(url_gen)
            # get_page_data(url_gen)
        pool = ThreadPool(8)
        results = pool.map(get_page_data, urll)
    except KeyboardInterrupt:
        print('you are stopped script yourself')


if __name__ == '__main__':
    main()
我在尝试使用 BeautifulSoup 和请求模块时遇到错误。
我的代码如下:
import requests
from bs4 import BeautifulSoup
def get_html(url):
url = ('https://m.vk.com/bageto?act=members&offset=0')
r = requests.get(url)
return r.text
def get_total_pages(get_html):
soup = BeautifulSoup(get_html, 'lxml')
pages = soup.find('div', class_='pagination').find_all('a', class_='pg_link')[-1].get('href')
total_pages = pages.split('=')[2]
return int(total_pages)
def main():
base_url = 'https://m.vk.com/bageto?act=members&offset='
total_pages = get_total_pages(get_html)
for i in range(50, total_pages, 50):
print (i)
这会产生错误:
C:\Users\PANDEMIC\Desktop\Python-Test>vkp.py Traceback (most recent call last):
File "C:\Users\PANDEMIC\Desktop\Python-Test\vkp.py",
line 23, in <module>
main()
File "C:\Users\PANDEMIC\Desktop\Python-Test\vkp.py", line 20, in main
total_pages = get_total_pages(get_html)
File "C:\Users\PANDEMIC\Desktop\Python-Test\vkp.py", line 13, in get_total_pages
soup = BeautifulSoup(get_html, 'lxml')
File "C:\Users\PANDEMIC\AppData\Local\Programs\Python\Python36-32\lib\site-packages\bs4\__init__.py", line 192, in __init__
elif len(markup) <= 256 and (
TypeError: object of type 'function' has no len()
您忘记了给 get_html 加上括号和参数——应该调用 get_html(...) 并把它的返回值传进去,而不是把函数对象本身传进去:
total_pages = get_total_pages( get_html(base_url) )
顺便说一句:你不需要 get_html 里那行 url 赋值,因为它会覆盖你传入的参数。
def get_html(url):
    """Fetch *url* and return the response body as text.

    The caller supplies the URL; the old hard-coded assignment that
    clobbered the parameter stays commented out.
    """
    # url = ('https://m.vk.com/bageto?act=members&offset=0')
    r = requests.get(url)
    return r.text
或者您可以使用默认值
def get_html(url='https://m.vk.com/bageto?act=members&offset=0'):
    """Fetch *url* and return the response body as text.

    The first members page is used as the default when no URL is given.
    (The original snippet was missing the ':' after the def header.)
    """
    r = requests.get(url)
    return r.text
完整版:把 base_url+"0" 作为参数传入,即 get_html(base_url+"0"):
import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url* and return the response body as text."""
    # Do not overwrite the url parameter here -- a hard-coded assignment
    # was the original bug: it silently ignored the caller's argument.
    # url = ('https://m.vk.com/bageto?act=members&offset=0')
    r = requests.get(url)
    return r.text


def get_total_pages(html):
    """Parse the members-list HTML and return the last page offset as an int."""
    soup = BeautifulSoup(html, 'lxml')
    # The last pagination link's href ends in '...&offset=N'; N is what we want.
    pages = soup.find('div', class_='pagination').find_all('a', class_='pg_link')[-1].get('href')
    total_pages = pages.split('=')[2]
    return int(total_pages)


def main():
    base_url = 'https://m.vk.com/bageto?act=members&offset='
    # Pass the *result* of get_html(...) -- an HTML string -- not the function itself.
    total_pages = get_total_pages(get_html(base_url + "0"))
    print(total_pages)
    for i in range(50, total_pages, 50):
        print(i)
        # print(base_url + str(i))


main()
import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url* and return the response body as text.

    NOTE(review): the original version re-assigned a hard-coded URL to
    ``url`` here, silently ignoring the caller's argument. The line is
    removed so the parameter is actually honoured; behaviour is unchanged
    because main() passes exactly that URL.
    """
    r = requests.get(url)
    return r.text


def get_total_pages(html):
    """Parse the members-list HTML and return the last page offset as an int."""
    soup = BeautifulSoup(html, 'lxml')
    pages = soup.find('div', class_='pagination').find_all('a', class_='pg_link')[-1].get('href')
    total_pages = pages.split('=')[2]
    return int(total_pages)


def main():
    # base_url already carries offset=0, so no extra suffix is needed.
    base_url = 'https://m.vk.com/bageto?act=members&offset=0'
    html = get_html(base_url)
    total_pages = get_total_pages(html)
    print(total_pages)
您应该将 html 字符串传递给 BeautifulSoup,而不是函数。
def main():
    """Build the list of paginated member URLs and scrape them with a thread pool."""
    # Local import keeps the snippet self-contained; ThreadPool is stdlib.
    from multiprocessing.dummy import Pool as ThreadPool
    try:
        urll = []
        base_url = 'https://m.vk.com/bageto?act=members&offset='
        # The original referenced an undefined name ``url`` here (NameError);
        # the first page is base_url with offset 0.
        total_pages = int(get_total_pages(get_html(base_url + '0')))
        for i in range(0, total_pages, 50):
            url_gen = str(base_url + str(i))
            urll.append(url_gen)
            # get_page_data(url_gen)
        pool = ThreadPool(8)
        results = pool.map(get_page_data, urll)
    except KeyboardInterrupt:
        print('you are stopped script yourself')


if __name__ == '__main__':
    main()