Python bs4 + lxml parsing table
I want to parse the table from this url - http://portal.ksada.org:8090/time-table/student?id=5598.
What I need in the end is some kind of data structure. For example, here is what I am trying to achieve:
class Schedule():
    date = '02.02.2022'  # headdate class in html
    day = 'Ср'           # headday class in html
    lessons = [['1 пара #span lesson', '09:00-10:35', 'КомпКн[Пз]', 'ауд. 304', 'Чайка Л.Е.'],
               [...], ]  # div with class lessons-1 or lessons-2
With that, I would know exactly how many lessons there are on any given day. Maybe this is not the best design, and maybe that is why I am stuck.
Overall, what I want is to structure all of this so that I can get the lessons for a day, a week, and a month. I have tried many approaches but keep getting stuck.
What I have right now is this code:
import requests
from bs4 import BeautifulSoup

url = 'http://portal.ksada.org:8090/time-table/student?id='
id = 5598

def get_data(url, id):
    page = requests.get(url + str(id))  # id is an int, so cast it before concatenating
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.select_one('table')
    items = []
    for tr in table.select('tr'):
        th_list = tr.select('th')
        td_list = tr.select('td')
        for th in th_list:
            print(th.text)
        for td in td_list:
            print(td.text.strip().replace(' ', ''))
I also tried to find the "distance" between each day, like this:
def get_data(url, id):
    page = requests.get(url + str(id))
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find('table')
    tbody = table.find_all('tr')
    days = []  # row indexes where a new day starts (rows with a 'headday' header)
    for i, t in enumerate(tbody):
        if t.find('th', class_='headday'):
            days.append(i)
and use it like this:
for i, d in enumerate(days[:-1]):
    for t in tbody[days[i]:days[i+1]]:
        ...
I just don't know how to put this together nicely.
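For reference, here is a minimal sketch of one way those day indexes could be used to slice the rows into per-day groups, assuming the days list and tbody from the snippet above; extracting the individual lesson fields is left open:

day_groups = []                   # list of (day_label, rows_for_that_day) pairs
boundaries = days + [len(tbody)]  # trailing sentinel so the last day is kept
for start, end in zip(boundaries, boundaries[1:]):
    label = tbody[start].find('th', class_='headday').text.strip()
    day_groups.append((label, tbody[start:end]))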
I hope this helps you get to a final solution.
# Import the library - pandas
import pandas as pd

table_list = pd.read_html('http://portal.ksada.org:8090/time-table/student?id=5598',
                          attrs={'id': 'timeTable'}, flavor='lxml')
df = table_list[0].replace(r' ', 'NoValue', regex=True)  # replace the value with NoValue, in case needed further
df_header = ['Day', 'W1', 'W2', 'W3', 'W4', 'W5']
df.columns = df_header  # logical header
df.head(2)  # this can be commented out as this is only for data viewing
Since pandas reads the first row as the header, convert it back into the first data row.
# converting header to first row data
df_t = pd.DataFrame(columns=df_header, data=[table_list[0].columns.tolist()])
This is the final dataframe that will be used for the data needs.
df_final = pd.concat([df_t, df], ignore_index=True)  # DataFrame.append was removed in pandas 2.x, so use concat
df_final.head(5)  # this can be commented out as this is only for data viewing
# setup for group weeks
week_days_notation = ['Пн', 'Вт', 'Ср', 'Чт', 'Пт', 'Сб', 'Нд']
day_of_week = ""
week_days = []
for e in df_final['Day']:
    if e in week_days_notation:
        day_of_week = e
    week_days.append(day_of_week)
# week_days
# add the week_days to the dataframe
df_final.insert(0, 'week_group', week_days)
df_final.head(2)
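As an aside, the same week_group column can be built without the explicit loop, by keeping only the weekday labels and forward-filling them down. A small sketch, assuming the df_final and week_days_notation defined above:

# keep only the rows whose 'Day' value is a weekday label, then forward-fill downwards;
# fillna("") matches the loop version for any rows before the first weekday label
day_labels = df_final['Day'].where(df_final['Day'].isin(week_days_notation))
df_final['week_group'] = day_labels.ffill().fillna("")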
# group by week
df_final_grp = df_final.groupby('week_group')
# now we can get a week day and iterate over it if needed
# give me only 'Wednesday': 'Ср'
wed_classes = df_final_grp.get_group('Ср')
wed_classes.head(10)
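If more than one day is needed, the GroupBy object can also be iterated directly; each item is a (label, sub-DataFrame) pair. A small usage sketch:

for day_label, day_frame in df_final_grp:
    print(day_label)         # e.g. 'Ср'
    print(day_frame.head())  # the rows belonging to that weekday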
Another use case: moving the lesson duration into the cells that are missing it. Moving the time slot from the Day column into W1, W2, and so on can be done with apply and a regex function.
import re

def add_date_range(row, coln):
    if "NoValue" in row[coln]:  # if NoValue is in the week-day cell, return spaces
        return " "
    day = row['Day']
    regexp = re.compile(r'(\d{2}:\d{2}(.*)\d{2}:\d{2})+')
    r_m = regexp.search(day)  # check whether a 14:45-style time range is in Day
    if r_m:
        class_duration = r_m[0]  # get the duration, e.g. 14:45 16:20, from Day
        r_w = regexp.search(row[coln])  # search for a time in the week cell
        if r_w:
            return row[coln]
        else:
            return f"{class_duration} {row[coln]}"  # prepend the duration to the original value
    else:
        return row[coln]  # pass the original value through

df_final['W1'] = df_final.apply(add_date_range, args=("W1",), axis=1)
df_final['W2'] = df_final.apply(add_date_range, args=("W2",), axis=1)
df_final['W3'] = df_final.apply(add_date_range, args=("W3",), axis=1)
df_final['W4'] = df_final.apply(add_date_range, args=("W4",), axis=1)
df_final['W5'] = df_final.apply(add_date_range, args=("W5",), axis=1)
Then apply the grouping.
# group by week
df_final_grp = df_final.groupby('week_group')
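Finally, a minimal sketch of how the grouped frame could be mapped back onto a Schedule-like structure along the lines of the question; the dataclass and the cell filtering below are only illustrative assumptions, not part of the code above:

from dataclasses import dataclass, field

@dataclass
class Schedule:                    # illustrative container mirroring the question
    day: str
    lessons: list = field(default_factory=list)

schedules = []
for day_label, day_frame in df_final_grp:
    sched = Schedule(day=day_label)
    for _, row in day_frame.iterrows():
        # keep only the week cells that still hold a lesson after the cleanup above
        cells = [str(row[c]) for c in ['W1', 'W2', 'W3', 'W4', 'W5']
                 if str(row[c]).strip() not in ('', 'NoValue', 'nan')]
        if cells:
            sched.lessons.append(cells)
    schedules.append(sched)

Note that because the grouping key is the weekday label, each Schedule here collects the rows of that weekday across all weeks in the table.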