使用 BeautifulSoup 和 Python 2.7 使用 Google 登录网站
Login to a website with Google using BeautifulSoup and Python 2.7
我正在为 Quora 编写 Python 网络爬虫,但需要使用 Google 登录。我已经在网上搜索过,但没有什么能满足我的问题。这是我的代码:
# -*- coding: utf-8 -*-
import mechanize
import os
import requests
import urllib2
from bs4 import BeautifulSoup
import cookielib
# Store the cookies and create an opener that will hold them
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# Add our headers
opener.addheaders = [('User-agent', 'RedditTesting')]
# Install our opener (note that this changes the global opener to the one
# we just made, but you can also just call opener.open() if you want)
urllib2.install_opener(opener)
# The action/ target from the form
authentication_url = 'https://quora.com'
# Input parameters we are going to send
payload = {
'op': 'login-main',
'user': '<username>',
'passwd': '<password>'
}
# Use urllib to encode the payload
data = urllib.urlencode(payload)
# Build our Request object (supplying 'data' makes it a POST)
req = urllib2.Request(authentication_url, data)
# Make the request and read the response
resp = urllib2.urlopen(req)
contents = resp.read()
# specify the url
quote_page = "https://www.quora.com/"
# query the website and return the html to the variable ‘page’
page = urllib2.urlopen(quote_page)
# parse the html using beautiful soup and store in variable `soup`
soup = BeautifulSoup(page, 'html.parser')
# Take out the <div> of name and get its value
name_box = soup.find('div', attrs={"class": "ContentWrapper"})
name = name_box.text.strip() # strip() is used to remove starting and trailing
print name
for link in soup.find_all('img'):
image = link.get("src")
os.path.split(image)
image_name = os.path.split(image)[1]
print(image_name)
r2 = requests.get(image)
with open(image_name, "wb") as f:
f.write(r2.content)
由于我没有该站点的任何实际用户名,因此我使用自己的 Gmail 帐户。为了登录,我使用了来自不同问题的一些代码,但这不起作用。
任何缩进错误都是由于我糟糕的格式造成的。
要登录并抓取,请使用 requests 的会话(Session):先以您的凭据作为有效载荷,向登录表单的 action 端点发出 POST 请求;会话会自动保留返回的 cookie,之后用同一会话发出 GET 请求即可抓取已登录状态下的页面。
import requests
from bs4 import BeautifulSoup

# Use a single Session so the cookies set by the login POST are
# automatically sent with the follow-up GET.
# NOTE: indentation fixed — the statements below must sit inside the
# `with` block for the session to be used (and closed) correctly.
with requests.Session() as s:
    # NOTE(review): the POST should target the site's actual login
    # endpoint (the form's `action` URL), not the site root — confirm
    # the real endpoint in the page's HTML / network tab.
    p = s.post("https://quora.com", data={
        "email": '*******',
        "password": "*************"
    })
    # Inspect the response body to verify the login succeeded.
    print(p.text)

    base_page = s.get('https://quora.com')
    soup = BeautifulSoup(base_page.content, 'html.parser')
    print(soup.title)
我正在为 Quora 编写 Python 网络爬虫,但需要使用 Google 登录。我已经在网上搜索过,但没有什么能满足我的问题。这是我的代码:
# -*- coding: utf-8 -*-
import mechanize
import os
import requests
import urllib2
from bs4 import BeautifulSoup
import cookielib
# Store the cookies and create an opener that will hold them
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# Add our headers
opener.addheaders = [('User-agent', 'RedditTesting')]
# Install our opener (note that this changes the global opener to the one
# we just made, but you can also just call opener.open() if you want)
urllib2.install_opener(opener)
# The action/ target from the form
authentication_url = 'https://quora.com'
# Input parameters we are going to send
payload = {
'op': 'login-main',
'user': '<username>',
'passwd': '<password>'
}
# Use urllib to encode the payload
data = urllib.urlencode(payload)
# Build our Request object (supplying 'data' makes it a POST)
req = urllib2.Request(authentication_url, data)
# Make the request and read the response
resp = urllib2.urlopen(req)
contents = resp.read()
# specify the url
quote_page = "https://www.quora.com/"
# query the website and return the html to the variable ‘page’
page = urllib2.urlopen(quote_page)
# parse the html using beautiful soup and store in variable `soup`
soup = BeautifulSoup(page, 'html.parser')
# Take out the <div> of name and get its value
name_box = soup.find('div', attrs={"class": "ContentWrapper"})
name = name_box.text.strip() # strip() is used to remove starting and trailing
print name
for link in soup.find_all('img'):
image = link.get("src")
os.path.split(image)
image_name = os.path.split(image)[1]
print(image_name)
r2 = requests.get(image)
with open(image_name, "wb") as f:
f.write(r2.content)
由于我没有该站点的任何实际用户名,因此我使用自己的 Gmail 帐户。为了登录,我使用了来自不同问题的一些代码,但这不起作用。
任何缩进错误都是由于我糟糕的格式造成的。
要登录并抓取,请使用 requests 的会话(Session):先以您的凭据作为有效载荷,向登录表单的 action 端点发出 POST 请求;会话会自动保留返回的 cookie,之后用同一会话发出 GET 请求即可抓取已登录状态下的页面。
import requests
from bs4 import BeautifulSoup

# Use a single Session so the cookies set by the login POST are
# automatically sent with the follow-up GET.
# NOTE: indentation fixed — the statements below must sit inside the
# `with` block for the session to be used (and closed) correctly.
with requests.Session() as s:
    # NOTE(review): the POST should target the site's actual login
    # endpoint (the form's `action` URL), not the site root — confirm
    # the real endpoint in the page's HTML / network tab.
    p = s.post("https://quora.com", data={
        "email": '*******',
        "password": "*************"
    })
    # Inspect the response body to verify the login succeeded.
    print(p.text)

    base_page = s.get('https://quora.com')
    soup = BeautifulSoup(base_page.content, 'html.parser')
    print(soup.title)