如何从文本文件中一个一个地抓取公司名称,我的代码只抓取列表中的最后一个公司名称
How to scrape company names from a text file one by one, my code only scrapes the last company name on the list
我正在尝试从公司数据库中抓取公司信息。我在一个文本文件中有一个公司列表,我希望 selenium 将其输入到网站的搜索中并逐一抓取所需的信息。
我的问题是,出于某种原因,它只输入了列表中的最后一个名称。我如何才能让 Python 先抓取列表中的第一个公司名称,然后是下一个,依此类推?
我的代码如下:
# -*- coding: utf-8 -*-
# from typing_extensions import Self
from lib2to3.pgen2 import driver
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from shutil import which
# Read every line of cegek.txt into memory; the file handle is never closed.
count = 0
file = open ("cegek.txt", "r")
lines = file.readlines()
# This loop only counts the lines. Once it finishes, `line` is still bound to
# the final entry of `lines` (including any trailing newline), and that is the
# only value the spider below ever sees.
for line in lines:
    count += 1
# # cegek = "1000 Út Kft."
class HtSpiderSeleniumceg(scrapy.Spider):
    """Spider that performs a single ceginfo.hu search in Chrome.

    The search is submitted once, in ``__init__``, using the module-level
    ``line``; ``parse`` then reads the HTML captured there instead of the
    ``response`` Scrapy passes in.
    """

    name = 'kamara'
    # NOTE(review): this entry has four w's and a trailing slash, so it does
    # not match the host used in start_urls - confirm what was intended.
    allowed_domains = ["wwww.ceginfo.hu/"]
    start_urls = [
        'https://www.ceginfo.hu'
    ]

    def __init__(self):
        """Open Chrome, submit one search and store the resulting page source."""
        # NOTE(review): scrapy.Spider.__init__ is not called here.
        chrome_options = Options()
        # chrome_options.add_argument("--headless")
        #get login page
        driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        driver.get("https://www.ceginfo.hu/")
        # `line` is the leftover loop variable from module level, so this types
        # the last line of cegek.txt and no other name.
        driver.find_element_by_xpath("//input[@type='search']").send_keys(line)
        sleep(2)
        # u'\ue007' is the code point Selenium defines for the Enter key.
        driver.find_element_by_xpath("//input[@type='search']").send_keys(u'\ue007')
        # The page source is read right after Enter is sent, with no wait.
        self.html = driver.page_source
        driver.close()

    #scrape needed info
    def parse(self, response):
        """Yield one dict for the first result container in the stored HTML.

        ``response`` is not used; the data comes from ``self.html``.
        """
        resp = Selector(text=self.html)
        # The trailing [1] restricts the match to the first such <div>.
        for ceg in resp.xpath("(//div[contains(@class, 'd-flex flex-column flex-sm-row justify-content-between align-items-center')])[1]"):
            yield {
                # This expression starts with //, so it is evaluated against the
                # whole document; the two below start with .// and stay inside `ceg`.
                'cegnev': ceg.xpath("(//h2[contains(@class,'s-title heading')])[1]/text()").get(),
                'adoszam': ceg.xpath("(.//span[@class='text-uppercase c-border me-lg-3'])[1]/text()").get(),
                'cegjegy': ceg.xpath("(.//span[@class='c-border'])[1]/text()").get()
            }
这是公司名称列表的确切格式:
SZIMIKRON Ipari Kft.
Tigra Computer- és Irodatechnikai Kft.
Tradeland Kft.
Török László EV Török Kulcsszervíz
Tungsram Operations Kft.
Tutti Élelmiszeripari Kft.
Water and Soil Kft.
Webkey Development Kft.
ZDMnet
在一些帮助下,现在可以搜索列表中的第一个名称了,但由于以下错误,爬虫没有抓取任何内容就中断了:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=100.0.4896.60)
这是我的新代码,底部被注释掉的部分是针对我的新问题的一个建议解决方案,但我不知道如何实现它,我尝试把它放在不同的位置,但都不起作用。另外,我不确定 'your_element' 指的是什么。这个解决方案是在下面这个帖子中提出的:
StaleElementReferenceException on Python Selenium
# -*- coding: utf-8 -*-
# from typing_extensions import Self
from lib2to3.pgen2 import driver
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
# # cegek = "1000 Út Kft."
class HtSpiderSeleniumceg(scrapy.Spider):
    """Spider that drives a Chrome window and searches ceginfo.hu per name.

    ``__init__`` opens the company list page and submits the first name;
    ``parse`` then loops over every name, scraping the current page and typing
    the next search through the module-level ``this_driver`` handle.
    """

    name = 'kamara'
    # NOTE(review): this entry has four w's and a trailing slash, so it does
    # not match the host used in start_urls - confirm what was intended.
    allowed_domains = ["wwww.ceginfo.hu/"]
    start_urls = [
        'https://www.ceginfo.hu'
    ]
    # `global` inside the class body makes the assignment below bind a
    # module-level name instead of a class attribute. The file is read when
    # the class body runs and the handle is never closed explicitly.
    global names_to_search
    names_to_search = open("cegek.txt", "r").readlines()

    def __init__(self):
        """Start Chrome, load the company list page and submit the first name."""
        # NOTE(review): scrapy.Spider.__init__ is not called here.
        chrome_options = Options()
        # chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        self.driver.get("https://ceginfo.hu/ceglista/cegek")
        sleep(2)
        # Replace the class-level start_urls on this instance with whatever
        # URL the browser reports after the page load.
        self.start_urls = [self.driver.current_url]
        sleep(2)
        # Despite its name, this_driver holds the search <input> element, not
        # a WebDriver; it is shared with parse() through a module global.
        global this_driver
        this_driver = self.driver.find_element_by_xpath("//input[@type='search']")
        this_driver.send_keys(names_to_search[0])
        sleep(2)
        # u'\ue007' is the code point Selenium defines for the Enter key.
        this_driver.send_keys(u'\ue007')
        sleep(5)

    def parse(self, response):
        """Scrape the page currently in the browser once per name, yielding dicts.

        After each scrape the next name is typed into ``this_driver``; any
        exception raised while typing is printed over and the loop continues.
        """
        # NOTE(review): this_driver was located before this navigation;
        # reusing it afterwards is presumably what raises the reported
        # StaleElementReferenceException - confirm.
        self.driver.get(response.url)
        print('this_driver')
        print(this_driver)
        print('names_to_search')
        print(names_to_search)
        for names in names_to_search:
            print('searching this names:')
            print(names)
            # The page is captured before `names` is typed, so the rows yielded
            # in this pass come from whatever search was submitted earlier.
            resp = Selector(text=self.driver.page_source)
            sleep(5)
            # The trailing [1] restricts the match to the first such <p>.
            for ceg in resp.xpath("(//p[@class='mb-3 m-sm-0 meta d-flex flex-column flex-lg-row me-auto'])[1]"):
                yield {
                    # This expression starts with //, so it is evaluated against
                    # the whole document; the two below stay inside `ceg`.
                    'cegnev': ceg.xpath("(//h2[contains(@class,'s-title heading')])[1]/text()").get(),
                    'adoszam': ceg.xpath("(.//span[@class='text-uppercase c-border me-lg-3'])[1]/text()").get(),
                    'cegjegy': ceg.xpath("(.//span[@class='c-border'])[1]/text()").get()
                }
            try:
                print(this_driver)
                # The field is not cleared first, so this appends to any text
                # already in the input.
                this_driver.send_keys(names)
                # driver.find_element_by_xpath("//input[@type='search']").send_keys(line)
                sleep(2)
                this_driver.send_keys(u'\ue007')
            except:
                # Bare except: every error, including the stale-element one,
                # is reduced to this message and the loop moves on.
                print('exception - do not break')
        self.driver.close()
# Commented-out explicit wait that the post refers to; it is not executed.
# my_element_id = "(//p[@class='mb-3 m-sm-0 meta d-flex flex-column flex-lg-row me-auto'])[1]"
# ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,)
# your_element = WebDriverWait(self.driver, 20,ignored_exceptions=ignored_exceptions)\
# .until(expected_conditions.presence_of_element_located((By.XPATH, my_element_id)))
由于没有安装 Selenium、WebDriver 等,我无法完整运行您的代码,但您可以按下面的方式实现解决方案。
编写函数以从 cegek.txt 中读取名称并将其附加到列表中:
# Company names collected by get_names_to_search(), in file order.
names_to_search = []


def get_names_to_search(path="cegek.txt", encoding="utf-8"):
    """Append the company names listed in *path* to ``names_to_search``.

    Each line of the file is one name. Surrounding whitespace (including the
    line break) is stripped, and lines that are empty after stripping are
    skipped so a trailing blank line does not become an empty search term.

    Args:
        path: Text file to read; defaults to ``cegek.txt`` in the current
            working directory.
        encoding: Encoding used to decode the file. It is explicit because the
            names contain accented letters that a platform default code page
            may decode incorrectly.

    Returns:
        The module-level ``names_to_search`` list, after the new names have
        been appended to it.

    Raises:
        OSError: If *path* cannot be opened.
        UnicodeDecodeError: If the file is not valid in *encoding*.
    """
    # The context manager closes the handle even if decoding fails midway.
    with open(path, "r", encoding=encoding) as handle:
        stripped = (line.strip() for line in handle)
        names_to_search.extend(name for name in stripped if name)
    return names_to_search
# The names_to_search list will contain:
['SZIMIKRON Ipari Kft.', 'Tigra Computer- és Irodatechnikai Kft.', 'Tradeland Kft.', 'Török László EV Török Kulcsszervíz', 'Tungsram Operations Kft.', 'Tutti Élelmiszeripari Kft.', 'Water and Soil Kft.', 'Webkey Development Kft.', 'ZDMnet']
遍历 names_to_search
并将每个名称传递给 driver.find_element_by_xpath("//input[@type='search']").send_keys(name)
# Type each company name into the site's search box, one name per pass.
for name in names_to_search:
    # Look the input up again on every pass so the handle always belongs to
    # the page that is currently loaded.
    search_box = driver.find_element_by_xpath("//input[@type='search']")
    # send_keys appends to whatever is already typed, so empty the field first;
    # otherwise the names would pile up into one long query.
    search_box.clear()
    search_box.send_keys(name)
我正在尝试从公司数据库中抓取公司信息。我在一个文本文件中有一个公司列表,我希望 selenium 将其输入到网站的搜索中并逐一抓取所需的信息。
我的问题是,出于某种原因,它只输入了列表中的最后一个名称。我如何才能让 Python 先抓取列表中的第一个公司名称,然后是下一个,依此类推?
我的代码如下:
# -*- coding: utf-8 -*-
# from typing_extensions import Self
from lib2to3.pgen2 import driver
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from shutil import which
# Read every line of cegek.txt into memory; the file handle is never closed.
count = 0
file = open ("cegek.txt", "r")
lines = file.readlines()
# This loop only counts the lines. Once it finishes, `line` is still bound to
# the final entry of `lines` (including any trailing newline), and that is the
# only value the spider below ever sees.
for line in lines:
    count += 1
# # cegek = "1000 Út Kft."
class HtSpiderSeleniumceg(scrapy.Spider):
    """Spider that performs a single ceginfo.hu search in Chrome.

    The search is submitted once, in ``__init__``, using the module-level
    ``line``; ``parse`` then reads the HTML captured there instead of the
    ``response`` Scrapy passes in.
    """

    name = 'kamara'
    # NOTE(review): this entry has four w's and a trailing slash, so it does
    # not match the host used in start_urls - confirm what was intended.
    allowed_domains = ["wwww.ceginfo.hu/"]
    start_urls = [
        'https://www.ceginfo.hu'
    ]

    def __init__(self):
        """Open Chrome, submit one search and store the resulting page source."""
        # NOTE(review): scrapy.Spider.__init__ is not called here.
        chrome_options = Options()
        # chrome_options.add_argument("--headless")
        #get login page
        driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        driver.get("https://www.ceginfo.hu/")
        # `line` is the leftover loop variable from module level, so this types
        # the last line of cegek.txt and no other name.
        driver.find_element_by_xpath("//input[@type='search']").send_keys(line)
        sleep(2)
        # u'\ue007' is the code point Selenium defines for the Enter key.
        driver.find_element_by_xpath("//input[@type='search']").send_keys(u'\ue007')
        # The page source is read right after Enter is sent, with no wait.
        self.html = driver.page_source
        driver.close()

    #scrape needed info
    def parse(self, response):
        """Yield one dict for the first result container in the stored HTML.

        ``response`` is not used; the data comes from ``self.html``.
        """
        resp = Selector(text=self.html)
        # The trailing [1] restricts the match to the first such <div>.
        for ceg in resp.xpath("(//div[contains(@class, 'd-flex flex-column flex-sm-row justify-content-between align-items-center')])[1]"):
            yield {
                # This expression starts with //, so it is evaluated against the
                # whole document; the two below start with .// and stay inside `ceg`.
                'cegnev': ceg.xpath("(//h2[contains(@class,'s-title heading')])[1]/text()").get(),
                'adoszam': ceg.xpath("(.//span[@class='text-uppercase c-border me-lg-3'])[1]/text()").get(),
                'cegjegy': ceg.xpath("(.//span[@class='c-border'])[1]/text()").get()
            }
这是公司名称列表的确切格式:
SZIMIKRON Ipari Kft.
Tigra Computer- és Irodatechnikai Kft.
Tradeland Kft.
Török László EV Török Kulcsszervíz
Tungsram Operations Kft.
Tutti Élelmiszeripari Kft.
Water and Soil Kft.
Webkey Development Kft.
ZDMnet
在一些帮助下,现在可以搜索列表中的第一个名称了,但由于以下错误,爬虫没有抓取任何内容就中断了:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=100.0.4896.60)
这是我的新代码,底部被注释掉的部分是针对我的新问题的一个建议解决方案,但我不知道如何实现它,我尝试把它放在不同的位置,但都不起作用。另外,我不确定 'your_element' 指的是什么。这个解决方案是在下面这个帖子中提出的:StaleElementReferenceException on Python Selenium
# -*- coding: utf-8 -*-
# from typing_extensions import Self
from lib2to3.pgen2 import driver
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
# # cegek = "1000 Út Kft."
class HtSpiderSeleniumceg(scrapy.Spider):
    """Spider that drives a Chrome window and searches ceginfo.hu per name.

    ``__init__`` opens the company list page and submits the first name;
    ``parse`` then loops over every name, scraping the current page and typing
    the next search through the module-level ``this_driver`` handle.
    """

    name = 'kamara'
    # NOTE(review): this entry has four w's and a trailing slash, so it does
    # not match the host used in start_urls - confirm what was intended.
    allowed_domains = ["wwww.ceginfo.hu/"]
    start_urls = [
        'https://www.ceginfo.hu'
    ]
    # `global` inside the class body makes the assignment below bind a
    # module-level name instead of a class attribute. The file is read when
    # the class body runs and the handle is never closed explicitly.
    global names_to_search
    names_to_search = open("cegek.txt", "r").readlines()

    def __init__(self):
        """Start Chrome, load the company list page and submit the first name."""
        # NOTE(review): scrapy.Spider.__init__ is not called here.
        chrome_options = Options()
        # chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        self.driver.get("https://ceginfo.hu/ceglista/cegek")
        sleep(2)
        # Replace the class-level start_urls on this instance with whatever
        # URL the browser reports after the page load.
        self.start_urls = [self.driver.current_url]
        sleep(2)
        # Despite its name, this_driver holds the search <input> element, not
        # a WebDriver; it is shared with parse() through a module global.
        global this_driver
        this_driver = self.driver.find_element_by_xpath("//input[@type='search']")
        this_driver.send_keys(names_to_search[0])
        sleep(2)
        # u'\ue007' is the code point Selenium defines for the Enter key.
        this_driver.send_keys(u'\ue007')
        sleep(5)

    def parse(self, response):
        """Scrape the page currently in the browser once per name, yielding dicts.

        After each scrape the next name is typed into ``this_driver``; any
        exception raised while typing is printed over and the loop continues.
        """
        # NOTE(review): this_driver was located before this navigation;
        # reusing it afterwards is presumably what raises the reported
        # StaleElementReferenceException - confirm.
        self.driver.get(response.url)
        print('this_driver')
        print(this_driver)
        print('names_to_search')
        print(names_to_search)
        for names in names_to_search:
            print('searching this names:')
            print(names)
            # The page is captured before `names` is typed, so the rows yielded
            # in this pass come from whatever search was submitted earlier.
            resp = Selector(text=self.driver.page_source)
            sleep(5)
            # The trailing [1] restricts the match to the first such <p>.
            for ceg in resp.xpath("(//p[@class='mb-3 m-sm-0 meta d-flex flex-column flex-lg-row me-auto'])[1]"):
                yield {
                    # This expression starts with //, so it is evaluated against
                    # the whole document; the two below stay inside `ceg`.
                    'cegnev': ceg.xpath("(//h2[contains(@class,'s-title heading')])[1]/text()").get(),
                    'adoszam': ceg.xpath("(.//span[@class='text-uppercase c-border me-lg-3'])[1]/text()").get(),
                    'cegjegy': ceg.xpath("(.//span[@class='c-border'])[1]/text()").get()
                }
            try:
                print(this_driver)
                # The field is not cleared first, so this appends to any text
                # already in the input.
                this_driver.send_keys(names)
                # driver.find_element_by_xpath("//input[@type='search']").send_keys(line)
                sleep(2)
                this_driver.send_keys(u'\ue007')
            except:
                # Bare except: every error, including the stale-element one,
                # is reduced to this message and the loop moves on.
                print('exception - do not break')
        self.driver.close()
# Commented-out explicit wait that the post refers to; it is not executed.
# my_element_id = "(//p[@class='mb-3 m-sm-0 meta d-flex flex-column flex-lg-row me-auto'])[1]"
# ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,)
# your_element = WebDriverWait(self.driver, 20,ignored_exceptions=ignored_exceptions)\
# .until(expected_conditions.presence_of_element_located((By.XPATH, my_element_id)))
由于没有安装 Selenium、WebDriver 等,我无法完整运行您的代码,但您可以按下面的方式实现解决方案。
编写函数以从 cegek.txt 中读取名称并将其附加到列表中:
# Company names collected by get_names_to_search(), in file order.
names_to_search = []


def get_names_to_search(path="cegek.txt", encoding="utf-8"):
    """Append the company names listed in *path* to ``names_to_search``.

    Each line of the file is one name. Surrounding whitespace (including the
    line break) is stripped, and lines that are empty after stripping are
    skipped so a trailing blank line does not become an empty search term.

    Args:
        path: Text file to read; defaults to ``cegek.txt`` in the current
            working directory.
        encoding: Encoding used to decode the file. It is explicit because the
            names contain accented letters that a platform default code page
            may decode incorrectly.

    Returns:
        The module-level ``names_to_search`` list, after the new names have
        been appended to it.

    Raises:
        OSError: If *path* cannot be opened.
        UnicodeDecodeError: If the file is not valid in *encoding*.
    """
    # The context manager closes the handle even if decoding fails midway.
    with open(path, "r", encoding=encoding) as handle:
        stripped = (line.strip() for line in handle)
        names_to_search.extend(name for name in stripped if name)
    return names_to_search
# The names_to_search list will contain:
['SZIMIKRON Ipari Kft.', 'Tigra Computer- és Irodatechnikai Kft.', 'Tradeland Kft.', 'Török László EV Török Kulcsszervíz', 'Tungsram Operations Kft.', 'Tutti Élelmiszeripari Kft.', 'Water and Soil Kft.', 'Webkey Development Kft.', 'ZDMnet']
遍历 names_to_search
并将每个名称传递给 driver.find_element_by_xpath("//input[@type='search']").send_keys(name)
# Type each company name into the site's search box, one name per pass.
for name in names_to_search:
    # Look the input up again on every pass so the handle always belongs to
    # the page that is currently loaded.
    search_box = driver.find_element_by_xpath("//input[@type='search']")
    # send_keys appends to whatever is already typed, so empty the field first;
    # otherwise the names would pile up into one long query.
    search_box.clear()
    search_box.send_keys(name)