在 python 3.9.10 上使用 beautifulsoup 解析 instagram 的 html 登录页面
parsing instagram's html of log-in page with beautifulsoup on python 3.9.10
基本上我正在尝试构建一个可以通过 url 识别登录页面的程序。
我这样做的想法是通过页面解析来搜索文本框(而不是通过名称和类型来识别它们)。这是代码:
import requests
from bs4 import BeautifulSoup
# parse page html (soup)
def parse(soup):
    """Return the ``name`` attributes of text-like <input> tags in *soup*.

    Only inputs whose type is text, password or email are collected.
    """
    found = []
    for tag in soup.find_all('input'):
        # An <input> with no explicit type attribute defaults to "text",
        # and subscripting a missing attribute would raise KeyError, so
        # use .get() for both 'type' and 'name'.
        if tag.get('type', 'text') in ('text', 'password', 'email') and tag.get('name'):
            found.append(tag['name'])
    return found
# get site's html
def get_site_content(url):
    """Fetch *url*, parse its HTML and print the text-box names found.

    Note: requests does not execute JavaScript, so pages that build
    their login form client-side will appear to have no inputs.
    """
    # requests has no default timeout; without one a stalled server
    # would hang the script forever.
    html = requests.get(url, timeout=10)
    soup = BeautifulSoup(html.text, 'html5lib')
    textBoxes = parse(soup)
    print("Found in: " + url)
    print(textBoxes)
if __name__ == '__main__':
    # Probe a handful of known log-in pages.
    for target in ('https://login.facebook.com',
                   'https://www.instagram.com/accounts/login/',
                   'https://instagram.com',
                   'https://instagram.com/login',
                   'https://login.yahoo.com'):
        get_site_content(target)
似乎工作正常,但出于某种原因,我在 instagram 的登录页面上遇到了问题。这是输出:
Found in: https://login.facebook.com
['email', 'pass']
Found in: https://www.instagram.com/accounts/login/
[]
Found in: https://instagram.com
[]
Found in: https://instagram.com/login
[]
Found in: https://login.yahoo.com
['username', 'passwd']
Process finished with exit code 0
在使用不同的库获取 html 和不同的解析器之后,我开始明白问题出在 html = requests.get(url)
行上。它只是没有得到完整的 html。
有想法该怎么解决这个吗?
提前致谢!
顺便说一下,如果您对我想要完成的事情有更好的想法,我很乐意听到它:)
JavaScript 动态提供的内容不会由 requests 呈现。要获得呈现的 page_source,请使用 selenium。
您还可以 select 您的元素更具体:
for a in soup.select('input[name]'):
例子
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
# Start a Chrome session; webdriver-manager downloads a matching
# chromedriver binary automatically on first run.
driver = webdriver.Chrome(ChromeDriverManager().install())
def parse(soup):
    """Return the ``name`` attributes of text-like <input> tags in *soup*."""
    found = []
    # select('input[name]') guarantees a name attribute, but NOT a type
    # attribute, so a plain a['type'] lookup could raise KeyError.
    for tag in soup.select('input[name]'):
        # Inputs without an explicit type default to "text" in HTML.
        if tag.get('type', 'text') in ('text', 'password', 'email'):
            found.append(tag['name'])
    return found
def get_site_content(url):
    """Open *url* in the browser, wait for JS, and print input names found."""
    driver.get(url)
    time.sleep(2)  # crude wait for dynamically rendered content
    rendered = BeautifulSoup(driver.page_source, 'html5lib')
    names = parse(rendered)
    print("Found in: " + url)
    print(names)
if __name__ == '__main__':
    # Probe a handful of known log-in pages.
    for target in ('https://login.facebook.com',
                   'https://www.instagram.com/accounts/login/',
                   'https://instagram.com',
                   'https://instagram.com/login',
                   'https://login.yahoo.com'):
        get_site_content(target)
输出
Found in: https://login.facebook.com
['email', 'pass']
Found in: https://www.instagram.com/accounts/login/
['username', 'password']
Found in: https://instagram.com
['username', 'password']
Found in: https://instagram.com/login
['username', 'password']
Found in: https://login.yahoo.com
['username', 'passwd']
好吧,感谢@user:14460824 (HedgHog),我开始意识到问题是需要渲染页面,因为它是从 Javascript 动态渲染的。就个人而言,我不喜欢 selenium,而是使用 requests-html。它的操作与 selenium 相同,但感觉更容易使用,将来当我意识到如何识别网页是否从 Javascript 动态呈现时,这个库将更容易使用,所以我不会浪费资源。
这是代码:
from requests_html import HTMLSession
import requests
#parse page html
def parse(html):
    """Return the ``name`` attributes of text-like <input> elements in *html*."""
    found = []
    for element in html.find('input'):
        attrs = element.attrs
        # Default a missing type to "text" (the HTML default) instead of
        # subscripting it — the original checked attrs['type'] BEFORE
        # verifying the key exists, which raises KeyError on type-less
        # inputs. The name key is still checked before use.
        if attrs.get('type', 'text') in ('text', 'password', 'email') and 'name' in attrs:
            found.append(attrs['name'])
    return found
#get site's html
def get_site_content(url):
    """Fetch *url*, render its JavaScript, and return the resulting HTML.

    Returns None when the request fails (the error is printed instead).
    """
    try:
        session = HTMLSession()
        response = session.get(url)
        # TODO: detect whether the page is rendered dynamically from
        # JavaScript and only render in that case, so no resources are
        # wasted; for now every page is rendered.
        response.html.render(timeout=20)
        return response.html
    except requests.exceptions.RequestException as error:
        print(error)
def find_textboxes(url):
    """Print the text-box names discovered at *url*."""
    names = parse(get_site_content(url))
    print("Found in: " + url)
    print(names)
if __name__ == '__main__':
    # Probe a handful of known log-in pages.
    for target in ('https://login.facebook.com',
                   'https://www.instagram.com/accounts/login/',
                   'https://instagram.com',
                   'https://login.yahoo.com'):
        find_textboxes(target)
基本上我正在尝试构建一个可以通过 url 识别登录页面的程序。 我这样做的想法是通过页面解析来搜索文本框(而不是通过名称和类型来识别它们)。这是代码:
import requests
from bs4 import BeautifulSoup
# parse page html (soup)
def parse(soup):
    """Return the ``name`` attributes of text-like <input> tags in *soup*.

    Only inputs whose type is text, password or email are collected.
    """
    found = []
    for tag in soup.find_all('input'):
        # An <input> with no explicit type attribute defaults to "text",
        # and subscripting a missing attribute would raise KeyError, so
        # use .get() for both 'type' and 'name'.
        if tag.get('type', 'text') in ('text', 'password', 'email') and tag.get('name'):
            found.append(tag['name'])
    return found
# get site's html
def get_site_content(url):
    """Fetch *url*, parse its HTML and print the text-box names found.

    Note: requests does not execute JavaScript, so pages that build
    their login form client-side will appear to have no inputs.
    """
    # requests has no default timeout; without one a stalled server
    # would hang the script forever.
    html = requests.get(url, timeout=10)
    soup = BeautifulSoup(html.text, 'html5lib')
    textBoxes = parse(soup)
    print("Found in: " + url)
    print(textBoxes)
if __name__ == '__main__':
    # Probe a handful of known log-in pages.
    for target in ('https://login.facebook.com',
                   'https://www.instagram.com/accounts/login/',
                   'https://instagram.com',
                   'https://instagram.com/login',
                   'https://login.yahoo.com'):
        get_site_content(target)
似乎工作正常,但出于某种原因,我在 instagram 的登录页面上遇到了问题。这是输出:
Found in: https://login.facebook.com
['email', 'pass']
Found in: https://www.instagram.com/accounts/login/
[]
Found in: https://instagram.com
[]
Found in: https://instagram.com/login
[]
Found in: https://login.yahoo.com
['username', 'passwd']
Process finished with exit code 0
在使用不同的库获取 html 和不同的解析器之后,我开始明白问题出在 html = requests.get(url)
行上。它只是没有得到完整的 html。
有想法该怎么解决这个吗?
提前致谢!
顺便说一下,如果您对我想要完成的事情有更好的想法,我很乐意听到它:)
JavaScript 动态提供的内容不会由 requests 呈现。要获得呈现的 page_source,请使用 selenium。
您还可以 select 您的元素更具体:
for a in soup.select('input[name]'):
例子
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
# Start a Chrome session; webdriver-manager downloads a matching
# chromedriver binary automatically on first run.
driver = webdriver.Chrome(ChromeDriverManager().install())
def parse(soup):
    """Return the ``name`` attributes of text-like <input> tags in *soup*."""
    found = []
    # select('input[name]') guarantees a name attribute, but NOT a type
    # attribute, so a plain a['type'] lookup could raise KeyError.
    for tag in soup.select('input[name]'):
        # Inputs without an explicit type default to "text" in HTML.
        if tag.get('type', 'text') in ('text', 'password', 'email'):
            found.append(tag['name'])
    return found
def get_site_content(url):
    """Open *url* in the browser, wait for JS, and print input names found."""
    driver.get(url)
    time.sleep(2)  # crude wait for dynamically rendered content
    rendered = BeautifulSoup(driver.page_source, 'html5lib')
    names = parse(rendered)
    print("Found in: " + url)
    print(names)
if __name__ == '__main__':
    # Probe a handful of known log-in pages.
    for target in ('https://login.facebook.com',
                   'https://www.instagram.com/accounts/login/',
                   'https://instagram.com',
                   'https://instagram.com/login',
                   'https://login.yahoo.com'):
        get_site_content(target)
输出
Found in: https://login.facebook.com
['email', 'pass']
Found in: https://www.instagram.com/accounts/login/
['username', 'password']
Found in: https://instagram.com
['username', 'password']
Found in: https://instagram.com/login
['username', 'password']
Found in: https://login.yahoo.com
['username', 'passwd']
好吧,感谢@user:14460824 (HedgHog),我开始意识到问题是需要渲染页面,因为它是从 Javascript 动态渲染的。就个人而言,我不喜欢 selenium,而是使用 requests-html。它的操作与 selenium 相同,但感觉更容易使用,将来当我意识到如何识别网页是否从 Javascript 动态呈现时,这个库将更容易使用,所以我不会浪费资源。 这是代码:
from requests_html import HTMLSession
import requests
#parse page html
def parse(html):
    """Return the ``name`` attributes of text-like <input> elements in *html*."""
    found = []
    for element in html.find('input'):
        attrs = element.attrs
        # Default a missing type to "text" (the HTML default) instead of
        # subscripting it — the original checked attrs['type'] BEFORE
        # verifying the key exists, which raises KeyError on type-less
        # inputs. The name key is still checked before use.
        if attrs.get('type', 'text') in ('text', 'password', 'email') and 'name' in attrs:
            found.append(attrs['name'])
    return found
#get site's html
def get_site_content(url):
    """Fetch *url*, render its JavaScript, and return the resulting HTML.

    Returns None when the request fails (the error is printed instead).
    """
    try:
        session = HTMLSession()
        response = session.get(url)
        # TODO: detect whether the page is rendered dynamically from
        # JavaScript and only render in that case, so no resources are
        # wasted; for now every page is rendered.
        response.html.render(timeout=20)
        return response.html
    except requests.exceptions.RequestException as error:
        print(error)
def find_textboxes(url):
    """Print the text-box names discovered at *url*."""
    names = parse(get_site_content(url))
    print("Found in: " + url)
    print(names)
if __name__ == '__main__':
    # Probe a handful of known log-in pages.
    for target in ('https://login.facebook.com',
                   'https://www.instagram.com/accounts/login/',
                   'https://instagram.com',
                   'https://login.yahoo.com'):
        find_textboxes(target)