Issue with Lambda execution time
I'm working on a project where I have to crawl as many URLs as possible (listed in a file in an S3 bucket) within a limited time and store them in a searchable database. Right now I'm running into a problem crawling web pages in AWS Lambda. My task has a function that, when run in a Google Colab environment, takes only 7-8 seconds to execute and produces the desired results. However, when deployed as a Lambda, the same function takes almost 10 times as long to execute. Here is my code:
import requests
import re
import validators
import boto3
from smart_open import open
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
nltk.data.path.append("/tmp")
nltk.download("stopwords", download_dir = "/tmp")
def CrawlingLambda(event, context):
    """
    This lambda crawls a list of web pages, reading URLs from an S3 bucket, and returns a dictionary
    pairing each URL with its keywords.
    Args:
        event: Lambda invocation event (unused).
        context: Lambda runtime context (unused).
    Returns:
        dict: A mapping from each successfully crawled URL to the keywords extracted from its page.
    """
    results = {}
    client = boto3.client('s3')
    for line in open('s3://urls-to-monitor/URLs1T.txt', transport_params={'client': client}):
        if line[len(line)-1] != '/':
            url = line[:len(line)-2]
        else:
            url = line
        if validation(url) == False:
            continue
        try:
            web_content = scrape_web(url)
            results[url] = web_content
        except:
            continue
    return results

def validation(url):
    """
    Validates the URL string. This method uses regular expressions for validation under the hood.
    Args:
        url: URL to validate
    Returns:
        bool: True if the passed string is a valid URL and False otherwise.
    """
    return validators.url(url)

def scrape_web(url):
    """
    This function scrapes a given URL's web page for a specific set of keywords.
    Args:
        url: Page's URL to be scraped
    Returns:
        filtered_words: A refined list of extracted words from the web page.
    """
    try:
        res = requests.get(url, timeout=2)
    except:
        raise ValueError
    if res.status_code != 200:
        raise ValueError
    html_page = res.content
    soup = remove_tags(html_page)
    content = soup.get_text()
    words = re.split(r"\s+|/", content.lower())
    filtered_words = clean_wordlist(words)
    return tuple(filtered_words)

def remove_tags(html):
    """
    Remove the specified tags from the HTML response received from requests.get().
    Args:
        html: HTML response of the web page
    Returns:
        soup: Parsed response of HTML
    """
    # parse html content
    soup = BeautifulSoup(html, "html.parser")
    for data in soup(['style', 'script', 'noscript']):
        # remove the tag and its contents
        data.decompose()
    # return the parsed tree with the unwanted tags removed
    return soup

def clean_wordlist(wordlist):
    """
    This function removes any punctuation marks and stop words from our extracted word list.
    Args:
        wordlist: A list of raw words extracted from the HTML response of the web page.
    Returns:
        key_words: A filtered list of words containing only keywords.
    """
    words_without_symbol = []
    for word in wordlist:
        # symbols to ignore
        symbols = "!@#$%^&*()_-+={[}]|\\;:\"<>?/., "
        for i in range(len(symbols)):
            word = word.replace(symbols[i], '')
        if len(word) > 0:
            words_without_symbol.append(word)
    # ignoring the stopwords
    key_words = [word for word in words_without_symbol if word not in stopwords.words()]
    return key_words
Can anyone explain why the time difference is so large, and how I can reduce it?
The only configuration that affects performance is the memory allocation (Lambda allocates CPU in proportion to the configured memory). Try increasing the memory allocated to your function until its performance at least matches Colab.
Billing should not change much, because it is calculated as the product of memory and execution time.
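As a starting point, here is a minimal sketch of how the memory setting could be raised programmatically with boto3; the function name "CrawlingLambda" and the 1024 MB value are assumptions to adjust for your deployment:

import boto3

lambda_client = boto3.client("lambda")

# Hypothetical function name and memory size; adjust to your deployment.
# Lambda allocates CPU in proportion to memory, so more memory generally
# means faster execution for CPU-bound work such as HTML parsing.
lambda_client.update_function_configuration(
    FunctionName="CrawlingLambda",
    MemorySize=1024,  # in MB
)

Because the charge is roughly allocated memory multiplied by billed duration, doubling the memory while halving the runtime leaves the cost about the same (for example, 512 MB for 70 s and 1024 MB for 35 s amount to the same number of GB-seconds).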