Python web scraper freezes after 520 URLs are fed in. What's wrong with it?
- Sample urls.csv that is fed to the scraper:
Main,Income statement,Balance sheet,Cash flows
https://www.investing.com/equities/vical-inc,https://www.investing.com/equities/vical-inc-income-statement,https://www.investing.com/equities/vical-inc-balance-sheet,https://www.investing.com/equities/vical-inc-cash-flow
- ncav.py is the actual scraper:
from os import system, name
from time import sleep
from csv import reader, writer
from ncavfunctions import income, flashbalance


def clear():
    """Clears the console window."""
    if name == 'nt':
        _ = system('cls')
    else:
        _ = system('clear')


# Try to read urls.csv. country is a list of lists of strings.
try:
    with open('urls.csv', 'r', newline='') as csvfile:
        csv_reader = reader(csvfile)
        next(csv_reader)  # Skip headers
        country = []
        for line in csv_reader:
            country.append(
                [line[0],
                 line[1],
                 line[2],
                 line[3]])
    print("0. urls.csv loaded")
except:
    print("Error with urls.csv file!")
    sleep(5)

# Construct country_ncav, a list of tuples of strings.
country_ncav = []
i = 1
for line in country:
    clear()
    loading_perc = i * 100 / len(country)
    print("Processed {0:.2f}".format(loading_perc), "% of urls")
    print("Processing...")
    i = i + 1
    try:
        lst = \
            income(line[1])\
            + flashbalance(line[2])
        country_ncav.append(lst)
    except:
        country_ncav.append(["Unknown Error"])

# Save a csv log of the country ncav items.
header = ['Name', 'Shares', 'Last price',
          'Total current assets', 'Total Liabilities']
with open('flashncav.csv', 'w', newline='') as csvfile:
    csv_writer = writer(csvfile)
    csv_writer.writerow(header)
    csv_writer.writerows(country_ncav)
print("Closing in 5 seconds")
sleep(4)
print("Enjoy!")
sleep(1)
- The ncavfunctions module follows:
"""investing.com rejects get requests not identifying a User-Agent
1. Copy url to clipboard
2. Open Google Chrome, right click open space and click inspect
3. In Dev window click Network Tab
4. Paste url in Address Bar and press Enter, wait fully load
5. At Name window click info, on the right click Headers
6. Scroll Down to User-Agent and copy
7. Paste it between "" after "User-Agent": in var headers
8. Continue lines as needed
Parsing for investing.com
html.parser : prettify() encoding issues
lxml : prettify() encoding issues
lxml-xml : prettify() working but how grab from xml?
xml : prettify() working but how grab from xml?
html5lib : prettify() encoding isssues
Using html5, prettify() doesnt work due to encoding issues, but
i can grab the elements i want from soup element
"""
from requests import get
from bs4 import BeautifulSoup as soup
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10\
_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 \
Safari/537.36"}
def indusector(url_m):
"""Returns industry and sector from all known types of mains.
Keyword arguments:
url_m -- The url of company's main page.
"""
global inse
try:
inse = indusector_a(url_m)
except:
inse = (
"Error main page or data N/A",
"")
return inse
def indusector_a(url_m):
"""Returns industry and sector of company from type a main url.
url_m -- The url of company's main page.
"""
resp = get(url_m, headers=headers)
page = soup(resp.content, "html5lib")
cont_a = page.find("div", class_="companyProfileHeader")
cont_b = cont_a.find_all("a")
industry = cont_b[0].string
sector = cont_b[1].string
return industry, sector
def income(url_i):
"""Returns all NCAV items from all known type income statements.
Keyword arguments:
url_i -- The url of the Income statement.
"""
try:
inc = income_a(url_i)
except:
try:
inc = income_b(url_i)
except:
inc = (
"Error income statement or data n/a",
"",
"")
return inc
def income_a(url_i):
"""Returns all NCAV items from income statements type a.
Keyword arguments:
url_i -- The url of the Income statement.
"""
resp = get(url_i, headers=headers)
page = soup(resp.content, "html5lib")
# Grab secondaries from js dialog box and Diluted weighted average
# shares
cont_a = page.find_all("tbody")
cont_b = cont_a[2].find_all("tr")
cont_c = cont_b[31].find_all("td")
shares = cont_c[1].string
# Grab last price
cont_a = page.find(id="last_last")
lprice = cont_a.string
# Grab Name
cont_a = page.find("h1", class_="float_lang_base_1 relativeAttr")
namet = cont_a.string
tcut = len(namet) - 1
name = namet[0:tcut]
return name, shares, lprice
def income_b(url_i):
"""Returns all NCAV items from income statements type b.
Keyword arguments:
url_i -- The url of the Income statement.
"""
resp = get(url_i, headers=headers)
page = soup(resp.content, "html5lib")
cont_a = page.find_all("tbody")
cont_b = cont_a[1].find_all("tr") # [1]vs[2] is the difference
cont_c = cont_b[31].find_all("td") # between the 2 types.
shares = cont_c[1].string
cont_a = page.find(id="last_last")
lprice = cont_a.string
cont_a = page.find("h1", class_="float_lang_base_1 relativeAttr")
namet = cont_a.string
tcut = len(namet) - 1
name = namet[0:tcut]
return name, shares, lprice
def balance(url_b):
"""Returns all NCAV items from all known type Balance sheets.
Keyword arguments:
url_b -- The url of the Balance sheet.
"""
try:
bal = balance_a(url_b)
except:
bal = (
"Error balance sheet or data n/a",
"", "",
"", "",
"", "")
return bal
def balance_a(url_b):
"""Returns all NCAV items from Balance sheet type a.
Keyword arguments:
url_b -- The url of the Balance sheet.
"""
resp = get(url_b, headers=headers)
page = soup(resp.content, "html5lib")
# Grab bolds of js dialog box
cont_a = page.find_all(id="parentTr")
# Grab last total current assets
cont_b = cont_a[0].find_all("td")
last_tot_curr_ass = cont_b[1].string
# Grab last total liabilities
cont_b = cont_a[3].find_all("td")
last_tot_liabs = cont_b[1].string
# Grab secondaries of js dialog box
cont_a = page.find_all("tr", class_="child")
# Grab last cash
cont_b = cont_a[1].find_all("td")
last_cash = cont_b[1].string
# Grab last cash & equivalents
cont_b = cont_a[2].find_all("td")
last_casnnequins = cont_b[1].string
# Grab accounts receivables
cont_b = cont_a[5].find_all("td")
last_accreceivs = cont_b[1].string
# Grab last inventory
cont_b = cont_a[6].find_all("td")
last_invs = cont_b[1].string
# Grab last total debt
cont_b = cont_a[27].find_all("td")
last_tot_dts = (cont_b[1].string)
return (
last_tot_curr_ass, last_cash,
last_casnnequins, last_accreceivs,
last_invs, last_tot_liabs,
last_tot_dts)
def cashflow(url_c):
"""Returns opcash and capex from Statement of cash flows all types.
Keyword arguments:
url_c -- The url of the Statement of cash flows.
"""
try:
cas = cashflow_a(url_c)
except:
cas = (
"Error cash flow statement or data n/a", "",
"", "",
"", "",
"", "")
return cas
def cashflow_a(url_c):
"""Returns opcash and capex from Statement of cash flows type a.
Keyword arguments:
url_c -- The url of the Statement of cash flows.
"""
resp = get(url_c, headers=headers)
page = soup(resp.content, "html5lib")
# Grab bolds of js dialog box and incremental operating income
cont_a = page.find_all(id="parentTr")
cont_b = cont_a[0].find_all("td")
incr_opcash_4 = cont_b[1].string
incr_opcash_3 = cont_b[2].string
incr_opcash_2 = cont_b[3].string
incr_opcash_1 = cont_b[4].string
# Grab secondaries of js dialog box and incremental capital
# expenditures
cont_a = page.find_all("tr", class_="child")
cont_b = cont_a[9].find_all("td")
incr_capex_4 = cont_b[1].string
incr_capex_3 = cont_b[2].string
incr_capex_2 = cont_b[3].string
incr_capex_1 = cont_b[4].string
return (
incr_opcash_4, incr_opcash_3,
incr_opcash_2, incr_opcash_1,
incr_capex_4, incr_capex_3,
incr_capex_2, incr_capex_1)
def lastprice(url_i):
"""Returns last price from all known type income statements.
Keyword arguments:
url_i -- The url of the Income statement.
"""
try:
lprice = lastprice_a(url_i)
except:
try:
lprice = lastprice_b(url_i)
except:
lprice = ("iError", "iError")
return lprice
def lastprice_a(url_i):
"""Returns last price from income statements type a.
Keyword arguments:
url_i -- The url of the Income statement.
"""
resp = get(url_i, headers=headers)
page = soup(resp.content, "html5lib")
# Grab last price
cont_a = page.find(id="last_last")
lprice = cont_a.string
return lprice
def flashbalance(url_b):
"""Returns all NCAV items from all known type Balance sheets.
Keyword arguments:
url_b -- The url of the Balance sheet.
"""
try:
flashbal = flashbalance_a(url_b)
except:
flashbal = ("Error balance sheet or data n/a", "")
return flashbal
def flashbalance_a(url_b):
"""Returns all NCAV items from Balance sheet type a.
Keyword arguments:
url_b -- The url of the Balance sheet.
"""
resp = get(url_b, headers=headers)
page = soup(resp.content, "html5lib")
# Grab bolds of js dialog box
cont_a = page.find_all(id="parentTr")
# Grab last total current assets
cont_b = cont_a[0].find_all("td")
last_tot_curr_ass = cont_b[1].string
# Grab last total liabilities
cont_b = cont_a[3].find_all("td")
last_tot_liabs = cont_b[1].string
return (last_tot_curr_ass, last_tot_liabs)
- Sample output flashncav.csv:
Name,Shares,Last price,Total current assets,Total Liabilities
Vical Inc (BBI),2.94,1.4700,30.73,13.14
- The problem is that ncav.py freezes once the csv file contains more than
130 rows (each row holds 4 urls, hence the 520 urls in the title).
The current workaround is to manually break urls.csv (sometimes 2,000
companies) into groups of 130 rows. Any better ideas? Thanks in advance.
I think your memory is getting overloaded. As a first step, I would monitor memory usage while the script runs, with something like top on Linux.
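If you would rather log memory from inside the script, here is a minimal sketch using only the standard library (the log_memory helper is my own illustration; note that the resource module is Unix-only and ru_maxrss is reported in KiB on Linux but in bytes on macOS):

import resource  # standard library, Unix-only


def log_memory(label):
    """Print the peak resident set size of this process so far."""
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("{}: peak RSS {} (KiB on Linux)".format(label, peak))


# Hypothetical call site, once per loop iteration:
# log_memory("after company {}".format(i))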
If running out of memory is indeed the problem, one thing you could do better is not to append the results to the country_ncav list, but to write them directly to the file, one row at a time.
As general advice, try an existing scraping library or framework such as Scrapy; it will make this easier and more performant, unless the exercise is to build your own scraper.
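For a taste of what that could look like, here is a minimal, untested Scrapy sketch; NcavSpider is a made-up name, the settings values are placeholders, and only the #last_last selector is taken from the code above:

import csv

import scrapy


class NcavSpider(scrapy.Spider):
    """Fetches every income-statement url from urls.csv concurrently."""
    name = "ncav"
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 ...",  # same User-Agent trick as above
        "CONCURRENT_REQUESTS": 4,
        "DOWNLOAD_TIMEOUT": 30,
    }

    def start_requests(self):
        with open("urls.csv", newline="") as f:
            rows = csv.reader(f)
            next(rows)  # skip the header row
            for row in rows:
                yield scrapy.Request(row[1], callback=self.parse)

    def parse(self, response):
        # Only the #last_last id comes from the original code; extend as needed.
        yield {
            "url": response.url,
            "last_price": response.css("#last_last::text").get(),
        }

Running it with scrapy runspider ncav_spider.py -o ncav.csv streams each item to disk as it arrives, so nothing accumulates in memory.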
The code below implements the line-by-line writing suggested above.
New ncav.py:
from time import sleep
from csv import reader, writer
from ncavfunctions import income, flashbalance

# Try to read urls.csv. country is a list of lists of strings.
try:
    with open('urls.csv', 'r', newline='') as csvfile:
        csv_reader = reader(csvfile)
        next(csv_reader)  # Skip headers
        country = []
        for line in csv_reader:
            country.append(
                [line[0],
                 line[1],
                 line[2],
                 line[3]])
    print("File urls.csv loaded.")
except:
    print("Error loading urls.csv file!")
    sleep(5)

# Initiate the csv log.
try:
    header = ['Name', 'Shares', 'Last price',
              'Total current assets', 'Total Liabilities']
    with open('ncav.csv', 'w', newline='') as csvfile:
        csv_writer = writer(csvfile)
        csv_writer.writerow(header)
    print("File ncav.csv successfully initiated.")
except:
    print("Error initiating ncav.csv")

# Update the csv log one row at a time.
i = 1
for line in country:
    loading_perc = i * 100 / len(country)
    print("Processed {0:.2f}".format(loading_perc), "% of urls")
    print("Processing...")
    i = i + 1
    try:
        lst = \
            income(line[1])\
            + flashbalance(line[2])
        with open('ncav.csv', 'a', newline='') as csvfile:
            csv_writer = writer(csvfile)
            csv_writer.writerow(lst)
        print("Company info saved.")
    except:
        print("Error saving company info.")
print("Quitting in 5 seconds")
sleep(4)
print("Enjoy!")
sleep(1)
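One more design note on the version above: reopening ncav.csv for every row works, but the file can also be opened once before the loop and flushed after each write, which avoids the repeated open/close while still keeping the log current. A sketch of that variant, assuming the same country list built earlier:

from csv import writer
from ncavfunctions import income, flashbalance

with open('ncav.csv', 'a', newline='') as csvfile:
    csv_writer = writer(csvfile)
    for line in country:
        try:
            lst = income(line[1]) + flashbalance(line[2])
            csv_writer.writerow(lst)
            csvfile.flush()  # persist the row even if a later url fails
        except:
            print("Error saving company info.")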