从网页读取表格时，如何在 Python 3.x 中使用 Pandas 忽略（omit/ignore）那些仅由合并单元格构成的行？

Question

假设您想从该页面（this page）保存经济日历表格（table），但不包含日期行：

于是你写了下面的代码：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

#Selenium options object used to configure the Chrome session
opt = Options()
#attach Selenium to the Chrome instance reachable at the DevTools debugger
#address localhost:9222 (presumably Chrome must already be running with
#remote debugging enabled on that port -- confirm for your setup)
opt.add_experimental_option("debuggerAddress", "localhost:9222") 
#chromedriver executable to use; the path is specific to the author's machine
s = Service(r'C:\Users\ResetStoreX\AppData\Local\Programs\Python\Python39\Scripts\chromedriver.exe')
#create the driver from the service and options defined above
driver = webdriver.Chrome(service=s, options=opt) 

def wait_xpath(code):
    """Block for up to 8 seconds until an element matching the XPath `code` is present."""
    locator = (By.XPATH, code)
    element_present = EC.presence_of_element_located(locator)
    WebDriverWait(driver, 8).until(element_present)

#open the investing.com economic calendar page
driver.get('https://www.investing.com/economic-calendar/')

#wait (up to 8 seconds, see wait_xpath) for the economic calendar table to be present
wait_xpath('/html/body/div[5]/section/div[6]/table')

#wait up to 5 seconds until all rows of the table body are visible
WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located((By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr')))

#keep a reference to the table body element
table_body = driver.find_element(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody')

#every td under the table body, as one flat list of WebElements (row boundaries are lost here)
cells = table_body.find_elements(By.TAG_NAME, 'td')

#flat list that will hold the text of each cell
cell_list = []

#column names of the resulting dataframe
column_names = ["Time", "Currency", "Volatility expected", "Event", "Actual", "Forecast", "Previous"]

#collect the text of every cell except the first one
#(presumably the first td is the date header row -- confirm against the page)
for row in cells[1:]:
    cell_list.append(row.text)
    
#drop every 8th cell (1-based positions 8, 16, 24, ...)
#NOTE: this assumes each table row contributes exactly 8 cells
cell_list = [word for idx, word in enumerate(cell_list, 1) if idx % 8 != 0]

#regroup the flat list into rows of 7 columns
#NOTE: raises ValueError when the number of remaining cells is not a multiple of 7,
#e.g. when a row has a different number of cells than the 8 assumed above
cell_list = np.array(cell_list).reshape(-1, 7).tolist()

#create a dataframe including the column names
df = pd.DataFrame(cell_list, columns=column_names)

#third cell (td[3]) of every row: the expected volatility, shown as stars on the page
volatilities_expected = table_body.find_elements(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr/td[3]')

#list that will hold the 'title' attribute of each volatility cell
volatility_list = []

#read the 'title' attribute of each volatility cell (the star cells have no useful text)
for volatility in volatilities_expected:
    volatility_list.append(volatility.get_attribute('title'))
    
#reshape into a single column: a list of one-element lists
volatility_list = np.array(volatility_list).reshape(-1, 1).tolist()

#overwrite the 'Volatility expected' column with the collected titles
df['Volatility expected'] = [v[0] for v in volatility_list]

运行后，您将得到以下输出（以今天的数据为例）：

到目前为止，一切看起来都是正确的。但是，当把日历切换到“明天”，并运行与上面相同的代码（去掉 driver.get('https://www.investing.com/economic-calendar/') 这一行）时：

您会注意到多出了一行：该行只是一个合并单元格，用于提示明天全天是印度的假期。

因此，程序抛出错误：

ValueError: cannot reshape array of size 312 into shape (7)

原本用于创建 df 的列表最终也错位了：

那么，该如何处理这些由合并单元格构成的意外假期行，以便像第一个示例那样构建出正确的 df？

Answer 1

我弄明白了：我不得不重写大部分代码，其中 for row in row_list: 和 volatility_list.remove('') 是得到所需结果的关键：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

#Selenium options object used to configure the Chrome session
opt = Options()
#attach Selenium to the Chrome instance reachable at the DevTools debugger
#address localhost:9222 (presumably Chrome must already be running with
#remote debugging enabled on that port -- confirm for your setup)
opt.add_experimental_option("debuggerAddress", "localhost:9222") 
#chromedriver executable to use; the path is specific to the author's machine
s = Service(r'C:\Users\ResetStoreX\AppData\Local\Programs\Python\Python39\Scripts\chromedriver.exe')
#create the driver from the service and options defined above
driver = webdriver.Chrome(service=s, options=opt) 

def wait_xpath(code, timeout=8):
    """Wait until an element matching an XPath is present in the DOM.

    Args:
        code: XPath expression identifying the element to wait for.
        timeout: Maximum number of seconds to wait. Defaults to 8, the
            value that used to be hard-coded.

    Returns:
        The value returned by the wait, i.e. the located element, so
        callers can use it directly instead of searching for it again.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the element does not
        appear within ``timeout`` seconds.
    """
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, code))
    )

#open the investing.com economic calendar page
driver.get('https://www.investing.com/economic-calendar/')

#block until the economic calendar table exists in the DOM
wait_xpath('/html/body/div[5]/section/div[6]/table')

#then allow up to 5 seconds for all rows of the table body to become visible
all_rows_visible = EC.visibility_of_all_elements_located(
    (By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr')
)
WebDriverWait(driver, 5).until(all_rows_visible)

#lenght of the table (number of tr elements; variable name kept as in the original script)
table_lenght = len(driver.find_elements(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr'))

#actual row list containing rows as lists in string format
row_list = []

#column names
column_names = ["Time", "Currency", "Volatility expected", "Event", "Actual", "Forecast", "Previous"]

#count removed rows (rows that do not have the expected 8 cells, e.g. merged-cell holiday rows)
removed_rows = 0

#walk the table rows starting at the second tr, exactly like the original `i = 2` loop.
#Filtering, trimming and the time fix all happen while the real tr index `i` is known:
# - removing items from row_list while iterating over it skips the element that
#   follows each removed one, so undesired rows could survive the clean-up;
# - recomputing the tr index afterwards as `index + 2 + removed_rows` is only right
#   when every removed row sits above the current one.
for i in range(2, table_lenght + 1):
    row_xpath = f'/html/body/div[5]/section/div[6]/table/tbody/tr[{i}]'
    cells_in_a_row = [cell.text for cell in driver.find_elements(By.XPATH, f'{row_xpath}/td')]

    #skip undesired rows: only rows with exactly 8 cells are kept
    if len(cells_in_a_row) != 8:
        removed_rows += 1
        continue

    #drop the last cell so that the row matches the 7 column names
    row = cells_in_a_row[:7]

    #if the time cell contains "min", replace it with the actual time value
    #stored in the row's data-event-datetime attribute
    if "min" in row[0]:
        print(f"This row: {len(row_list)} {row}")
        actual_time = driver.find_element(By.XPATH, row_xpath).get_attribute('data-event-datetime')
        #keep the displayed text when the attribute is missing instead of failing on the slice
        if actual_time:
            #characters 11..15 of the attribute value are used as the time, as in the original
            row[0] = actual_time[11:16]
            print(f"Was updated with these other row: {row} ")

    row_list.append(row)

#create a dataframe including the column names
#(every kept row already has exactly 7 items, so no numpy reshape is needed)
df = pd.DataFrame(row_list, columns=column_names)

#store the table body information
table_body = driver.find_element(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody')

#store the volatilities expected (those which are measured with stars):
#the third cell (td[3]) of every row of the table
volatilities_expected = table_body.find_elements(By.XPATH, '/html/body/div[5]/section/div[6]/table/tbody/tr/td[3]')

#read the 'title' attribute of each volatility cell (human readable volatility)
volatility_list = [volatility.get_attribute('title') for volatility in volatilities_expected]

#remove undesired values: drop every cell without a title.
#`volatility_list.remove('')` only deletes the first empty title and raises
#ValueError when there is none, so it only works with exactly one such row.
volatility_list = [title for title in volatility_list if title]

#add the volatility list to the volatility expected column
#(pandas raises ValueError if the number of titles does not match the rows of df)
df['Volatility expected'] = volatility_list

In [2]: df

Out[2]:

     Time Currency           Volatility expected  ...   Actual Forecast Previous
0   00:30      JPY  Moderate Volatility Expected  ...    -0.7%              0.1%
1   02:30      JPY      High Volatility Expected  ...                           
2   05:00      EUR       Low Volatility Expected  ...  -5.052B            1.103B
3   05:00      EUR       Low Volatility Expected  ...   -0.89B            -3.80B
4   06:00      EUR  Moderate Volatility Expected  ...    1.50%             2.20%
5   06:00      EUR       Low Volatility Expected  ...    1.90%             2.30%
6   06:00      EUR  Moderate Volatility Expected  ...   -27.2B             -4.8B
7   06:30      RUB      High Volatility Expected  ...   20.00%   20.00%   20.00%
8   07:30      INR       Low Volatility Expected  ...  622.28B           631.92B
9   08:00      BRL  Moderate Volatility Expected  ...    11.2%    11.4%    11.1%
10  08:00      RUB  Moderate Volatility Expected  ...                           
11  08:30      CAD      High Volatility Expected  ...     2.5%     2.4%    -2.7%
12  08:30      CAD  Moderate Volatility Expected  ...   13.49B            37.54B
13  08:30      CAD       Low Volatility Expected  ...  -14.42B            21.29B
14  08:30      CAD  Moderate Volatility Expected  ...     1.1%              0.9%
15  08:30      CAD  Moderate Volatility Expected  ...     3.2%     2.4%    -2.0%
16  10:00      USD  Moderate Volatility Expected  ...    -7.2%    -1.0%     6.6%
17  10:00      USD      High Volatility Expected  ...    6.02M    6.10M    6.49M
18  10:00      USD       Low Volatility Expected  ...     0.3%     0.3%    -0.5%
19  12:30      USD       Low Volatility Expected  ...                           
20  13:00      USD  Moderate Volatility Expected  ...      524               527
21  13:00      USD  Moderate Volatility Expected  ...      663               663
22  14:00      USD       Low Volatility Expected  ...                           
23  15:00      USD  Moderate Volatility Expected  ...                           
24  15:30      GBP  Moderate Volatility Expected  ...   -29.1K            -12.5K
25  15:30      USD       Low Volatility Expected  ...     2.2K              2.5K
26  15:30      USD       Low Volatility Expected  ...    19.0K             31.8K
27  15:30      USD       Low Volatility Expected  ...   507.2K            498.0K
28  15:30      USD  Moderate Volatility Expected  ...   341.8K            361.7K
29  15:30      USD  Moderate Volatility Expected  ...   261.8K            274.4K
30  15:30      USD  Moderate Volatility Expected  ...    19.0K             26.6K
31  15:30      USD       Low Volatility Expected  ...  -146.6K           -138.4K
32  15:30      USD  Moderate Volatility Expected  ...   102.2K            127.7K
33  15:30      USD       Low Volatility Expected  ...    51.6K             52.3K
34  15:30      USD       Low Volatility Expected  ...   217.6K            216.6K
35  15:30      USD       Low Volatility Expected  ...    10.9K             12.6K
36  15:30      CAD       Low Volatility Expected  ...    17.7K              7.6K
37  15:30      CHF       Low Volatility Expected  ...    -5.2K             -9.7K
38  15:30      AUD  Moderate Volatility Expected  ...   -44.9K            -78.2K
39  15:30      BRL  Moderate Volatility Expected  ...    44.2K             50.5K
40  15:30      JPY  Moderate Volatility Expected  ...   -62.3K            -55.9K
41  15:30      NZD       Low Volatility Expected  ...     3.7K            -12.4K
42  15:30      RUB  Moderate Volatility Expected  ...     7.5K              7.8K
43  15:30      EUR  Moderate Volatility Expected  ...    18.8K             58.8K

[44 rows x 7 columns]

从网页读取表格时，如何在 Python 3.x 中使用 Pandas 忽略（omit/ignore）那些仅由合并单元格构成的行？

When reading a table from a webpage, how to actually omit/ignore such rows that are just a combination of cells using Pandas on Python 3.x?

debugging

if-statement

dataframe

python-3.x

pandas