从网页读取表格(table)时,如何在 Python 3.x 中使用 Pandas 忽略(omit/ignore)那些只是由合并单元格构成的行?
When reading a table from a webpage, how to actually omit/ignore such rows that are just a combination of cells using Pandas on Python 3.x?
假设您想从这个页面(this page)保存经济日历表格,但不包含日期行(table):
于是您写了下面的代码:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
import numpy as np
#Chrome options object that will be handed to the driver
opt = Options()
#attach to the Chrome browser whose DevTools debugger listens on localhost:9222
opt.add_experimental_option("debuggerAddress", "localhost:9222")
#location of the chromedriver executable on this machine
chromedriver_path = r'C:\Users\ResetStoreX\AppData\Local\Programs\Python\Python39\Scripts\chromedriver.exe'
#service wrapping that chromedriver executable
s = Service(chromedriver_path)
#start the chrome driver with the service and the options above
driver = webdriver.Chrome(service=s, options=opt)
def wait_xpath(code, timeout=8):
    """Wait until an element matching an XPath is present in the DOM.

    Args:
        code: XPath expression of the element to wait for.
        timeout: Maximum number of seconds to wait (defaults to 8).

    Returns:
        The located WebElement.

    Raises:
        selenium.common.exceptions.TimeoutException: If no element matches
            the XPath within ``timeout`` seconds.
    """
    #uses the module-level driver
    return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.XPATH, code)))
#open the investing.com economic calendar
driver.get('https://www.investing.com/economic-calendar/')
#block until the calendar table exists in the DOM
wait_xpath('/html/body/div[5]/section/div[6]/table')
#then block (up to 5 seconds) until every row of the table is visible
rows_locator = (By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr')
WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located(rows_locator))
#grab the table body element
table_body = driver.find_element(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody')
#every <td> found below the table body, as WebElements
cells = table_body.find_elements(By.TAG_NAME, 'td')
#headers of the resulting dataframe
column_names = ["Time", "Currency", "Volatility expected", "Event", "Actual", "Forecast", "Previous"]
#text of every cell except the very first one
cell_list = [cell.text for cell in cells[1:]]
#drop every 8th entry (the 8th, 16th, 24th, ... item of the list)
del cell_list[7::8]
#group the flat list into rows of 7 values; numpy raises ValueError when the
#number of items is not a multiple of 7
cell_list = np.array(cell_list).reshape(-1, 7).tolist()
#build the dataframe with the column names
df = pd.DataFrame(cell_list, columns=column_names)
#third cell of every row (the volatility, which the page shows as stars)
volatilities_expected = table_body.find_elements(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr/td[3]')
#the 'title' attribute of each of those cells
volatility_list = [star_cell.get_attribute('title') for star_cell in volatilities_expected]
#turn the flat list into a list of single-item lists
volatility_list = np.array(volatility_list).reshape(-1, 1).tolist()
#overwrite the volatility column with the titles
df['Volatility expected'] = [entry[0] for entry in volatility_list]
运行后,您会得到以下输出(今天的数据):
到目前为止,一切似乎都是正确的。但是,当针对 明天 的数据运行与上面相同的代码(只是去掉 driver.get('https://www.investing.com/economic-calendar/') 这条语句)时:
您会注意到多出了一行,它只是一个合并单元格,用来提示明天全天是印度的假期。
因此,程序抛出错误:
ValueError: cannot reshape array of size 312 into shape (7)
而本来用于创建 df 的列表也因此错位了:
那么,应该如何处理这些由合并单元格构成的意外假期行,以便像第一个示例那样构建出正确的 df?
我找到办法了:我不得不重写大部分代码。关键是逐行读取表格并只保留恰好有 8 个单元格的行(for row in row_list:),同时把合并单元格的行带来的空波动率值去掉(volatility_list.remove('')):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
import numpy as np
#Chrome options object that will be handed to the driver
opt = Options()
#attach to the Chrome browser whose DevTools debugger listens on localhost:9222
opt.add_experimental_option("debuggerAddress", "localhost:9222")
#location of the chromedriver executable on this machine
chromedriver_path = r'C:\Users\ResetStoreX\AppData\Local\Programs\Python\Python39\Scripts\chromedriver.exe'
#service wrapping that chromedriver executable
s = Service(chromedriver_path)
#start the chrome driver with the service and the options above
driver = webdriver.Chrome(service=s, options=opt)
def wait_xpath(code, timeout=8):
    """Wait until an element matching an XPath is present in the DOM.

    Args:
        code: XPath expression of the element to wait for.
        timeout: Maximum number of seconds to wait (defaults to 8).

    Returns:
        The located WebElement.

    Raises:
        selenium.common.exceptions.TimeoutException: If no element matches
            the XPath within ``timeout`` seconds.
    """
    #uses the module-level driver
    return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.XPATH, code)))
#go to investing.com to check the economic calendar
driver.get('https://www.investing.com/economic-calendar/')
#wait for the economic calendar table to be located
wait_xpath('/html/body/div[5]/section/div[6]/table')
#wait for the information to load completely
WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located((By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr')))
#fetch every row of the table once; the cells and the attributes are read from these same elements below
table_rows = driver.find_elements(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr')
#lenght of the table
table_lenght = len(table_rows)
#actual row list containing rows as lists in string format
row_list = []
#column names
column_names = ["Time", "Currency", "Volatility expected", "Event", "Actual", "Forecast", "Previous"]
#count removed rows
removed_rows = 0
#walk the table starting at its second row (the original loop started at tr[2])
for table_row in table_rows[1:]:
    #text of the direct <td> children of this row
    cells_in_a_row = [cell.text for cell in table_row.find_elements(By.XPATH, './td')]
    #skip undesired rows: anything that does not have exactly 8 cells, such as a holiday notice made of combined cells.
    #This replaces the original "for row in row_list: ... row_list.remove(row)" loop, which removed items from the
    #list it was iterating and therefore skipped the row right after each removed one.
    if len(cells_in_a_row) != 8:
        removed_rows += 1
        continue
    #drop the last cell so that 7 values remain, one per column name
    x = cells_in_a_row[:-1]
    #if the time cell has "min", update it with the actual time value
    if "min" in x[0]:
        print(f"This row: {len(row_list)} {x}")
        #read the attribute from this very row element; the original rebuilt the row position as
        #index + 2 + removed_rows, which pointed at the wrong row whenever a removed row came after it
        actual_time = table_row.get_attribute('data-event-datetime')
        #leave the cell untouched when the row does not carry the attribute
        if actual_time is not None:
            #characters 11 to 15 of the attribute value (presumably the HH:MM part - TODO confirm the format)
            x[0] = actual_time[11:16]
            print(f"Was updated with these other row: {x} ")
    row_list.append(x)
#create a dataframe including the column names
df = pd.DataFrame(row_list, columns=column_names)
#store the table body information
table_body = driver.find_element(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody')
#store the volatilities expected (those which are measured with stars): the third cell of the rows that have
#exactly 8 cells, skipping the first row of the table, i.e. the same rows that are kept when the rows are filtered.
#This replaces volatility_list.remove(''), which deleted only the first empty title and raised ValueError
#when the table had no row made of combined cells.
volatilities_expected = table_body.find_elements(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr[position() > 1][count(td) = 8]/td[3]')
#convert the volatilities expected to human readable format (the 'title' attribute of each cell), in table order
volatility_list = [volatility.get_attribute('title') for volatility in volatilities_expected]
#add the volatility list to the volatility expected column
df['Volatility expected'] = volatility_list
In [2]: df
Out[2]:
Time Currency Volatility expected ... Actual Forecast Previous
0 00:30 JPY Moderate Volatility Expected ... -0.7% 0.1%
1 02:30 JPY High Volatility Expected ...
2 05:00 EUR Low Volatility Expected ... -5.052B 1.103B
3 05:00 EUR Low Volatility Expected ... -0.89B -3.80B
4 06:00 EUR Moderate Volatility Expected ... 1.50% 2.20%
5 06:00 EUR Low Volatility Expected ... 1.90% 2.30%
6 06:00 EUR Moderate Volatility Expected ... -27.2B -4.8B
7 06:30 RUB High Volatility Expected ... 20.00% 20.00% 20.00%
8 07:30 INR Low Volatility Expected ... 622.28B 631.92B
9 08:00 BRL Moderate Volatility Expected ... 11.2% 11.4% 11.1%
10 08:00 RUB Moderate Volatility Expected ...
11 08:30 CAD High Volatility Expected ... 2.5% 2.4% -2.7%
12 08:30 CAD Moderate Volatility Expected ... 13.49B 37.54B
13 08:30 CAD Low Volatility Expected ... -14.42B 21.29B
14 08:30 CAD Moderate Volatility Expected ... 1.1% 0.9%
15 08:30 CAD Moderate Volatility Expected ... 3.2% 2.4% -2.0%
16 10:00 USD Moderate Volatility Expected ... -7.2% -1.0% 6.6%
17 10:00 USD High Volatility Expected ... 6.02M 6.10M 6.49M
18 10:00 USD Low Volatility Expected ... 0.3% 0.3% -0.5%
19 12:30 USD Low Volatility Expected ...
20 13:00 USD Moderate Volatility Expected ... 524 527
21 13:00 USD Moderate Volatility Expected ... 663 663
22 14:00 USD Low Volatility Expected ...
23 15:00 USD Moderate Volatility Expected ...
24 15:30 GBP Moderate Volatility Expected ... -29.1K -12.5K
25 15:30 USD Low Volatility Expected ... 2.2K 2.5K
26 15:30 USD Low Volatility Expected ... 19.0K 31.8K
27 15:30 USD Low Volatility Expected ... 507.2K 498.0K
28 15:30 USD Moderate Volatility Expected ... 341.8K 361.7K
29 15:30 USD Moderate Volatility Expected ... 261.8K 274.4K
30 15:30 USD Moderate Volatility Expected ... 19.0K 26.6K
31 15:30 USD Low Volatility Expected ... -146.6K -138.4K
32 15:30 USD Moderate Volatility Expected ... 102.2K 127.7K
33 15:30 USD Low Volatility Expected ... 51.6K 52.3K
34 15:30 USD Low Volatility Expected ... 217.6K 216.6K
35 15:30 USD Low Volatility Expected ... 10.9K 12.6K
36 15:30 CAD Low Volatility Expected ... 17.7K 7.6K
37 15:30 CHF Low Volatility Expected ... -5.2K -9.7K
38 15:30 AUD Moderate Volatility Expected ... -44.9K -78.2K
39 15:30 BRL Moderate Volatility Expected ... 44.2K 50.5K
40 15:30 JPY Moderate Volatility Expected ... -62.3K -55.9K
41 15:30 NZD Low Volatility Expected ... 3.7K -12.4K
42 15:30 RUB Moderate Volatility Expected ... 7.5K 7.8K
43 15:30 EUR Moderate Volatility Expected ... 18.8K 58.8K
[44 rows x 7 columns]
假设您想从这个页面(this page)保存经济日历表格,但不包含日期行(table):
于是您写了下面的代码:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
import numpy as np
#Chrome options object that will be handed to the driver
opt = Options()
#attach to the Chrome browser whose DevTools debugger listens on localhost:9222
opt.add_experimental_option("debuggerAddress", "localhost:9222")
#location of the chromedriver executable on this machine
chromedriver_path = r'C:\Users\ResetStoreX\AppData\Local\Programs\Python\Python39\Scripts\chromedriver.exe'
#service wrapping that chromedriver executable
s = Service(chromedriver_path)
#start the chrome driver with the service and the options above
driver = webdriver.Chrome(service=s, options=opt)
def wait_xpath(code, timeout=8):
    """Wait until an element matching an XPath is present in the DOM.

    Args:
        code: XPath expression of the element to wait for.
        timeout: Maximum number of seconds to wait (defaults to 8).

    Returns:
        The located WebElement.

    Raises:
        selenium.common.exceptions.TimeoutException: If no element matches
            the XPath within ``timeout`` seconds.
    """
    #uses the module-level driver
    return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.XPATH, code)))
#open the investing.com economic calendar
driver.get('https://www.investing.com/economic-calendar/')
#block until the calendar table exists in the DOM
wait_xpath('/html/body/div[5]/section/div[6]/table')
#then block (up to 5 seconds) until every row of the table is visible
rows_locator = (By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr')
WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located(rows_locator))
#grab the table body element
table_body = driver.find_element(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody')
#every <td> found below the table body, as WebElements
cells = table_body.find_elements(By.TAG_NAME, 'td')
#headers of the resulting dataframe
column_names = ["Time", "Currency", "Volatility expected", "Event", "Actual", "Forecast", "Previous"]
#text of every cell except the very first one
cell_list = [cell.text for cell in cells[1:]]
#drop every 8th entry (the 8th, 16th, 24th, ... item of the list)
del cell_list[7::8]
#group the flat list into rows of 7 values; numpy raises ValueError when the
#number of items is not a multiple of 7
cell_list = np.array(cell_list).reshape(-1, 7).tolist()
#build the dataframe with the column names
df = pd.DataFrame(cell_list, columns=column_names)
#third cell of every row (the volatility, which the page shows as stars)
volatilities_expected = table_body.find_elements(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr/td[3]')
#the 'title' attribute of each of those cells
volatility_list = [star_cell.get_attribute('title') for star_cell in volatilities_expected]
#turn the flat list into a list of single-item lists
volatility_list = np.array(volatility_list).reshape(-1, 1).tolist()
#overwrite the volatility column with the titles
df['Volatility expected'] = [entry[0] for entry in volatility_list]
运行后,您会得到以下输出(今天的数据):
到目前为止,一切似乎都是正确的。但是,当针对 明天 的数据运行与上面相同的代码(只是去掉 driver.get('https://www.investing.com/economic-calendar/') 这条语句)时:
您会注意到多出了一行,它只是一个合并单元格,用来提示明天全天是印度的假期。
因此,程序抛出错误:
ValueError: cannot reshape array of size 312 into shape (7)
而本来用于创建 df 的列表也因此错位了:
那么,应该如何处理这些由合并单元格构成的意外假期行,以便像第一个示例那样构建出正确的 df?
我找到办法了:我不得不重写大部分代码。关键是逐行读取表格并只保留恰好有 8 个单元格的行(for row in row_list:),同时把合并单元格的行带来的空波动率值去掉(volatility_list.remove('')):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
import numpy as np
#Chrome options object that will be handed to the driver
opt = Options()
#attach to the Chrome browser whose DevTools debugger listens on localhost:9222
opt.add_experimental_option("debuggerAddress", "localhost:9222")
#location of the chromedriver executable on this machine
chromedriver_path = r'C:\Users\ResetStoreX\AppData\Local\Programs\Python\Python39\Scripts\chromedriver.exe'
#service wrapping that chromedriver executable
s = Service(chromedriver_path)
#start the chrome driver with the service and the options above
driver = webdriver.Chrome(service=s, options=opt)
def wait_xpath(code, timeout=8):
    """Wait until an element matching an XPath is present in the DOM.

    Args:
        code: XPath expression of the element to wait for.
        timeout: Maximum number of seconds to wait (defaults to 8).

    Returns:
        The located WebElement.

    Raises:
        selenium.common.exceptions.TimeoutException: If no element matches
            the XPath within ``timeout`` seconds.
    """
    #uses the module-level driver
    return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.XPATH, code)))
#go to investing.com to check the economic calendar
driver.get('https://www.investing.com/economic-calendar/')
#wait for the economic calendar table to be located
wait_xpath('/html/body/div[5]/section/div[6]/table')
#wait for the information to load completely
WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located((By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr')))
#fetch every row of the table once; the cells and the attributes are read from these same elements below
table_rows = driver.find_elements(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr')
#lenght of the table
table_lenght = len(table_rows)
#actual row list containing rows as lists in string format
row_list = []
#column names
column_names = ["Time", "Currency", "Volatility expected", "Event", "Actual", "Forecast", "Previous"]
#count removed rows
removed_rows = 0
#walk the table starting at its second row (the original loop started at tr[2])
for table_row in table_rows[1:]:
    #text of the direct <td> children of this row
    cells_in_a_row = [cell.text for cell in table_row.find_elements(By.XPATH, './td')]
    #skip undesired rows: anything that does not have exactly 8 cells, such as a holiday notice made of combined cells.
    #This replaces the original "for row in row_list: ... row_list.remove(row)" loop, which removed items from the
    #list it was iterating and therefore skipped the row right after each removed one.
    if len(cells_in_a_row) != 8:
        removed_rows += 1
        continue
    #drop the last cell so that 7 values remain, one per column name
    x = cells_in_a_row[:-1]
    #if the time cell has "min", update it with the actual time value
    if "min" in x[0]:
        print(f"This row: {len(row_list)} {x}")
        #read the attribute from this very row element; the original rebuilt the row position as
        #index + 2 + removed_rows, which pointed at the wrong row whenever a removed row came after it
        actual_time = table_row.get_attribute('data-event-datetime')
        #leave the cell untouched when the row does not carry the attribute
        if actual_time is not None:
            #characters 11 to 15 of the attribute value (presumably the HH:MM part - TODO confirm the format)
            x[0] = actual_time[11:16]
            print(f"Was updated with these other row: {x} ")
    row_list.append(x)
#create a dataframe including the column names
df = pd.DataFrame(row_list, columns=column_names)
#store the table body information
table_body = driver.find_element(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody')
#store the volatilities expected (those which are measured with stars): the third cell of the rows that have
#exactly 8 cells, skipping the first row of the table, i.e. the same rows that are kept when the rows are filtered.
#This replaces volatility_list.remove(''), which deleted only the first empty title and raised ValueError
#when the table had no row made of combined cells.
volatilities_expected = table_body.find_elements(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr[position() > 1][count(td) = 8]/td[3]')
#convert the volatilities expected to human readable format (the 'title' attribute of each cell), in table order
volatility_list = [volatility.get_attribute('title') for volatility in volatilities_expected]
#add the volatility list to the volatility expected column
df['Volatility expected'] = volatility_list
In [2]: df
Out[2]:
Time Currency Volatility expected ... Actual Forecast Previous
0 00:30 JPY Moderate Volatility Expected ... -0.7% 0.1%
1 02:30 JPY High Volatility Expected ...
2 05:00 EUR Low Volatility Expected ... -5.052B 1.103B
3 05:00 EUR Low Volatility Expected ... -0.89B -3.80B
4 06:00 EUR Moderate Volatility Expected ... 1.50% 2.20%
5 06:00 EUR Low Volatility Expected ... 1.90% 2.30%
6 06:00 EUR Moderate Volatility Expected ... -27.2B -4.8B
7 06:30 RUB High Volatility Expected ... 20.00% 20.00% 20.00%
8 07:30 INR Low Volatility Expected ... 622.28B 631.92B
9 08:00 BRL Moderate Volatility Expected ... 11.2% 11.4% 11.1%
10 08:00 RUB Moderate Volatility Expected ...
11 08:30 CAD High Volatility Expected ... 2.5% 2.4% -2.7%
12 08:30 CAD Moderate Volatility Expected ... 13.49B 37.54B
13 08:30 CAD Low Volatility Expected ... -14.42B 21.29B
14 08:30 CAD Moderate Volatility Expected ... 1.1% 0.9%
15 08:30 CAD Moderate Volatility Expected ... 3.2% 2.4% -2.0%
16 10:00 USD Moderate Volatility Expected ... -7.2% -1.0% 6.6%
17 10:00 USD High Volatility Expected ... 6.02M 6.10M 6.49M
18 10:00 USD Low Volatility Expected ... 0.3% 0.3% -0.5%
19 12:30 USD Low Volatility Expected ...
20 13:00 USD Moderate Volatility Expected ... 524 527
21 13:00 USD Moderate Volatility Expected ... 663 663
22 14:00 USD Low Volatility Expected ...
23 15:00 USD Moderate Volatility Expected ...
24 15:30 GBP Moderate Volatility Expected ... -29.1K -12.5K
25 15:30 USD Low Volatility Expected ... 2.2K 2.5K
26 15:30 USD Low Volatility Expected ... 19.0K 31.8K
27 15:30 USD Low Volatility Expected ... 507.2K 498.0K
28 15:30 USD Moderate Volatility Expected ... 341.8K 361.7K
29 15:30 USD Moderate Volatility Expected ... 261.8K 274.4K
30 15:30 USD Moderate Volatility Expected ... 19.0K 26.6K
31 15:30 USD Low Volatility Expected ... -146.6K -138.4K
32 15:30 USD Moderate Volatility Expected ... 102.2K 127.7K
33 15:30 USD Low Volatility Expected ... 51.6K 52.3K
34 15:30 USD Low Volatility Expected ... 217.6K 216.6K
35 15:30 USD Low Volatility Expected ... 10.9K 12.6K
36 15:30 CAD Low Volatility Expected ... 17.7K 7.6K
37 15:30 CHF Low Volatility Expected ... -5.2K -9.7K
38 15:30 AUD Moderate Volatility Expected ... -44.9K -78.2K
39 15:30 BRL Moderate Volatility Expected ... 44.2K 50.5K
40 15:30 JPY Moderate Volatility Expected ... -62.3K -55.9K
41 15:30 NZD Low Volatility Expected ... 3.7K -12.4K
42 15:30 RUB Moderate Volatility Expected ... 7.5K 7.8K
43 15:30 EUR Moderate Volatility Expected ... 18.8K 58.8K
[44 rows x 7 columns]