如何添加计时器来计算我的代码执行时间
How to add a timer to calculate my code execution
我写了一个 python 脚本,它从 aliexpress 抓取产品。
这是我的代码:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import json
import csv
# Scrape AliExpress search results with headless Edge, parse the rendered
# HTML with lxml, and store one document per product in MongoDB.
# NOTE(review): the indentation below is reconstructed from the statements
# themselves -- confirm the nesting against the original script.

options = Options()
options.headless = True  # no visible browser window
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver", options=options)

# "{}" is replaced by the page number for each request.
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'
# Prepended to every product href found on the page.
baseurl = 'https://www.aliexpress.com'

for page_nb in range(1, 2):
    print('---', page_nb, '---')
    driver.get(url.format(page_nb))
    sleep(2)

    # Scroll down one viewport at a time until the vertical offset stops
    # growing, i.e. the bottom of the page has been reached.
    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # JavaScript has time to add elements
        new_offset = driver.execute_script("return window.pageYOffset;")
        print(new_offset, current_offset)
        if new_offset <= current_offset:
            break
        current_offset = new_offset

    sleep(3)

    # Parse the fully rendered page.
    tree = html.fromstring(driver.page_source)

    # NOTE: reset for every page; with more than one page only the rows of
    # the last page would reach the DataFrame below.
    results = []

    for product in tree.xpath('//div[@class="JIIxO"]//a'):
        title = product.xpath('.//h1/text()')

        # Anchors without an <h1> title are skipped entirely.
        if title:
            title = title[0]

            # The first span holds the currency, the remaining spans the amount.
            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]

            currency = price[0]
            price = ''.join(price[1:])

            # Optional fields fall back to the string 'None' when missing.
            stars = product.xpath('.//span[@class="eXPaM"]/text()')
            if stars:
                stars = stars[0]
            else:
                stars = 'None'

            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = 'None'

            supl = product.xpath('.//a[@class="ox0KZ"]/text()')
            if supl:
                supl = supl[0]
            else:
                supl = 'None'

            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            if ship_cost:
                ship_cost = ship_cost[0]
            else:
                ship_cost = 'None'

            # Turn the href into a full URL by prefixing the site root.
            product_links = product.xpath('./@href')
            if product_links:
                product_links = str(baseurl) + str(product_links[0])

            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)

    print('len(results):', len(results))

driver.close()

df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks"))

####### Insert in database #############
client = MongoClient("mongodb://localhost:27017/")
collection = client['db2']['aliex2']
# One MongoDB document per DataFrame row.
data = df.to_dict(orient='records')
collection.insert_many(data)
我的问题:
我需要添加一个计时器来计算处理时间,并返回一个值,以便知道代码运行了多长时间。
我还希望每次抓取后都能得到一个新的集合,因为现在当我第二次运行代码时,得到的数据仍然在旧集合中。
感谢您的帮助。谢谢!
希望您的问题在下面的代码中得到解决:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import json
import csv
import time as Time
# Scrape AliExpress search results with headless Edge, parse the rendered
# HTML with lxml, store one document per product in MongoDB, and report how
# long the whole run took.
# NOTE(review): the indentation below is reconstructed from the statements
# themselves -- confirm the nesting against the original script.

# Start the overall timer before any browser work so the measurement covers
# page loading, scrolling, parsing and the database insert. perf_counter()
# is a monotonic, high-resolution clock intended for measuring durations.
run_start = Time.perf_counter()

options = Options()
options.headless = True  # no visible browser window
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver", options=options)

# "{}" is replaced by the page number for each request.
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'
# Prepended to every product href found on the page.
baseurl = 'https://www.aliexpress.com'

# Created once, outside the page loop, so rows from every page accumulate.
results = []

try:
    for page_nb in range(1, 2):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)

        # Scroll down one viewport at a time until the vertical offset stops
        # growing, i.e. the bottom of the page has been reached.
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)

        # Parse the fully rendered page.
        tree = html.fromstring(driver.page_source)

        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            # Per-product timer: covers only the parsing of this one entry.
            start_time = Time.perf_counter()

            title = product.xpath('.//h1/text()')
            if not title:
                # Anchors without an <h1> title are skipped entirely.
                continue
            title = title[0]

            # The first span holds the currency, the remaining spans the
            # amount. Missing spans or empty text no longer raise.
            price_parts = [x.text or '' for x in product.cssselect('div.mGXnE._37W_B span')]
            currency = price_parts[0] if price_parts else 'None'
            price = ''.join(price_parts[1:])

            # Optional fields fall back to the string 'None' when missing.
            stars = product.xpath('.//span[@class="eXPaM"]/text()')
            stars = stars[0] if stars else 'None'

            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            nb_sold = nb_sold[0] if nb_sold else 'None'

            supl = product.xpath('.//a[@class="ox0KZ"]/text()')
            supl = supl[0] if supl else 'None'

            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            ship_cost = ship_cost[0] if ship_cost else 'None'

            # Turn the href into a full URL by prefixing the site root.
            product_links = product.xpath('./@href')
            product_links = str(baseurl) + str(product_links[0]) if product_links else 'None'

            # Seconds spent parsing this product; stored in "Time Taken".
            diffrence_time = Time.perf_counter() - start_time
            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links, diffrence_time]
            results.append(row)

        print('len(results):', len(results))
finally:
    # quit() ends the whole WebDriver session even when scraping failed.
    driver.quit()

df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks", "Time Taken"))

####### Insert in database #############
client = MongoClient("mongodb://localhost:27017/")
# Every run appends to the same collection. To write each run into a fresh
# collection instead, use a per-run name, for example:
#     client['db2']['aliex2_' + Time.strftime('%Y%m%d_%H%M%S')]
collection = client['db2']['aliex2']
# One MongoDB document per DataFrame row.
data = df.to_dict(orient='records')
if data:
    # insert_many() rejects an empty list, so skip it when nothing was scraped.
    collection.insert_many(data)

# Total wall-clock time of the run, in seconds.
total_time = Time.perf_counter() - run_start
print('Total execution time (seconds):', round(total_time, 2))
我计算了循环中处理每个产品所花费的时间,并将这个时间差存入了数据帧(DataFrame)的 "Time Taken" 列中。
我写了一个 python 脚本,它从 aliexpress 抓取产品。
这是我的代码:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import json
import csv
# Scrape AliExpress search results with headless Edge, parse the rendered
# HTML with lxml, and store one document per product in MongoDB.
# NOTE(review): the indentation below is reconstructed from the statements
# themselves -- confirm the nesting against the original script.

options = Options()
options.headless = True  # no visible browser window
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver", options=options)

# "{}" is replaced by the page number for each request.
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'
# Prepended to every product href found on the page.
baseurl = 'https://www.aliexpress.com'

for page_nb in range(1, 2):
    print('---', page_nb, '---')
    driver.get(url.format(page_nb))
    sleep(2)

    # Scroll down one viewport at a time until the vertical offset stops
    # growing, i.e. the bottom of the page has been reached.
    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # JavaScript has time to add elements
        new_offset = driver.execute_script("return window.pageYOffset;")
        print(new_offset, current_offset)
        if new_offset <= current_offset:
            break
        current_offset = new_offset

    sleep(3)

    # Parse the fully rendered page.
    tree = html.fromstring(driver.page_source)

    # NOTE: reset for every page; with more than one page only the rows of
    # the last page would reach the DataFrame below.
    results = []

    for product in tree.xpath('//div[@class="JIIxO"]//a'):
        title = product.xpath('.//h1/text()')

        # Anchors without an <h1> title are skipped entirely.
        if title:
            title = title[0]

            # The first span holds the currency, the remaining spans the amount.
            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]

            currency = price[0]
            price = ''.join(price[1:])

            # Optional fields fall back to the string 'None' when missing.
            stars = product.xpath('.//span[@class="eXPaM"]/text()')
            if stars:
                stars = stars[0]
            else:
                stars = 'None'

            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = 'None'

            supl = product.xpath('.//a[@class="ox0KZ"]/text()')
            if supl:
                supl = supl[0]
            else:
                supl = 'None'

            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            if ship_cost:
                ship_cost = ship_cost[0]
            else:
                ship_cost = 'None'

            # Turn the href into a full URL by prefixing the site root.
            product_links = product.xpath('./@href')
            if product_links:
                product_links = str(baseurl) + str(product_links[0])

            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)

    print('len(results):', len(results))

driver.close()

df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks"))

####### Insert in database #############
client = MongoClient("mongodb://localhost:27017/")
collection = client['db2']['aliex2']
# One MongoDB document per DataFrame row.
data = df.to_dict(orient='records')
collection.insert_many(data)
我的问题:
我需要添加一个计时器来计算处理时间,并返回一个值,以便知道代码运行了多长时间。我还希望每次抓取后都能得到一个新的集合,因为现在当我第二次运行代码时,得到的数据仍然在旧集合中。
感谢您的帮助。谢谢!
希望您的问题在下面的代码中得到解决:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import json
import csv
import time as Time
# Scrape AliExpress search results with headless Edge, parse the rendered
# HTML with lxml, store one document per product in MongoDB, and report how
# long the whole run took.
# NOTE(review): the indentation below is reconstructed from the statements
# themselves -- confirm the nesting against the original script.

# Start the overall timer before any browser work so the measurement covers
# page loading, scrolling, parsing and the database insert. perf_counter()
# is a monotonic, high-resolution clock intended for measuring durations.
run_start = Time.perf_counter()

options = Options()
options.headless = True  # no visible browser window
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver", options=options)

# "{}" is replaced by the page number for each request.
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'
# Prepended to every product href found on the page.
baseurl = 'https://www.aliexpress.com'

# Created once, outside the page loop, so rows from every page accumulate.
results = []

try:
    for page_nb in range(1, 2):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)

        # Scroll down one viewport at a time until the vertical offset stops
        # growing, i.e. the bottom of the page has been reached.
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)

        # Parse the fully rendered page.
        tree = html.fromstring(driver.page_source)

        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            # Per-product timer: covers only the parsing of this one entry.
            start_time = Time.perf_counter()

            title = product.xpath('.//h1/text()')
            if not title:
                # Anchors without an <h1> title are skipped entirely.
                continue
            title = title[0]

            # The first span holds the currency, the remaining spans the
            # amount. Missing spans or empty text no longer raise.
            price_parts = [x.text or '' for x in product.cssselect('div.mGXnE._37W_B span')]
            currency = price_parts[0] if price_parts else 'None'
            price = ''.join(price_parts[1:])

            # Optional fields fall back to the string 'None' when missing.
            stars = product.xpath('.//span[@class="eXPaM"]/text()')
            stars = stars[0] if stars else 'None'

            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            nb_sold = nb_sold[0] if nb_sold else 'None'

            supl = product.xpath('.//a[@class="ox0KZ"]/text()')
            supl = supl[0] if supl else 'None'

            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            ship_cost = ship_cost[0] if ship_cost else 'None'

            # Turn the href into a full URL by prefixing the site root.
            product_links = product.xpath('./@href')
            product_links = str(baseurl) + str(product_links[0]) if product_links else 'None'

            # Seconds spent parsing this product; stored in "Time Taken".
            diffrence_time = Time.perf_counter() - start_time
            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links, diffrence_time]
            results.append(row)

        print('len(results):', len(results))
finally:
    # quit() ends the whole WebDriver session even when scraping failed.
    driver.quit()

df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks", "Time Taken"))

####### Insert in database #############
client = MongoClient("mongodb://localhost:27017/")
# Every run appends to the same collection. To write each run into a fresh
# collection instead, use a per-run name, for example:
#     client['db2']['aliex2_' + Time.strftime('%Y%m%d_%H%M%S')]
collection = client['db2']['aliex2']
# One MongoDB document per DataFrame row.
data = df.to_dict(orient='records')
if data:
    # insert_many() rejects an empty list, so skip it when nothing was scraped.
    collection.insert_many(data)

# Total wall-clock time of the run, in seconds.
total_time = Time.perf_counter() - run_start
print('Total execution time (seconds):', round(total_time, 2))
我计算了循环中处理每个产品所花费的时间,并将这个时间差存入了数据帧(DataFrame)的 "Time Taken" 列中。