BeautifulSoup - Scraping a comment when the ID field changes


对于这个问题,我想把包含裁判(umpire)和比赛数据的 HTML 注释内容提取出来。请注意,这些 HTML 文件现在存储在本地,因此我是通过遍历一个文件夹来处理它们的。在页面源代码中,它看起来像这样:

           <div class="section_wrapper setup_commented commented" id="all_342042674">
<div class="section_heading">
  <span class="section_anchor" id="342042674_link" data-label="Other Info"></span>
    <h2>Other Info</h2>    <div class="section_heading_text">
</div><div class="placeholder"></div>
<!--
    <div class="section_content" id="div_342042674">
<div><strong>Umpires:</strong> HP - Greg Gibson, 1B - Jerry Layne, 2B - Jordan Baker, 3B - Vic Carapazza.</div><div><strong>Time of Game:</strong> 3:21.</div>
<div><strong>Attendance:</strong> 33,809.</div>
<div><strong>Start Time Weather:</strong> 70&deg; F, Wind 6mph out to Centerfield, Night, No Precipitation.</div>
</div>
-->



如您所见,这些数据位于 HTML 注释中。真正的挑战在于 ID 值会随着场地和赛季的变化而变化,而我要解析 10 年的数据。谁能告诉我,在 ID 会发生变化的情况下,如何提取注释中的文本?

# import libraries and files
import csv
import os

from bs4 import BeautifulSoup, Comment


# Scrape locally saved box-score pages into a list of game dicts, print the
# results, and write them to a CSV file. Written for Python 3.

# One dict per processed page, in directory-listing order.
games = []

path = r"D:\My Web Sites\baseball 2\\boxes\ANA"

# Labels used inside the commented-out "Other Info" section, mapped to the
# key each value is stored under in the game dict. Matching on the label text
# avoids depending on the section's id attribute, which differs per page.
other_info_keys = {
    'Umpires': 'umpires',
    'Time of Game': 'time_of_game',
    'Attendance': 'attendance',
    'Start Time Weather': 'weather',
}

# Column order for the CSV output.
csv_fields = [
    'date', 'start_time', 'venue', 'duration',
    'umpires', 'time_of_game', 'attendance', 'weather',
]

for filename in os.listdir(path):
    # Everything below must only run for HTML files; otherwise a stale (or
    # undefined) fullpath from a previous iteration would be parsed again.
    if not filename.endswith(".html"):
        continue

    fullpath = os.path.join(path, filename)
    print(f'Processing {fullpath}...')

    # Get Page, Make Soup. Binary mode lets BeautifulSoup detect the page
    # encoding itself; the with-block closes the file handle.
    with open(fullpath, 'rb') as page:
        soup = BeautifulSoup(page, 'lxml')

    # Setting up game object to append to list
    game = {}

    # Get Description
    # Note:  Skip every other child because of 'Navigable Strings' from BS.
    for div in soup.find_all('div', class_='scorebox_meta'):
        for idx, child in enumerate(div.children):
            if idx == 1:
                game['date'] = child.text
            elif idx == 3:
                # partition() yields '' instead of raising when the text has
                # no "label:" prefix.
                game['start_time'] = child.text.partition(':')[2].strip()
            elif idx == 7:
                game['venue'] = child.text.partition(':')[2].strip()
            elif idx == 9:
                game['duration'] = child.text.partition(':')[2].strip()

    # The player tables and the "Other Info" section are inside HTML
    # comments, so each comment is parsed as its own document.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        data = BeautifulSoup(comment, "lxml")

        # Get Player Data from tables
        for items in data.select("table tr"):
            # NOTE(review): player_data is built but not stored or written
            # anywhere yet; only the separator line is printed per row.
            player_data = [' '.join(item.text.split()) for item in items.select("th,td")]
            print('=======================================================')

        # Get Umpire Data
        for info in data.select('div.section_content > div'):
            label = info.find('strong')
            if label is None:
                continue
            key = other_info_keys.get(label.text.strip().rstrip(':'))
            if key is not None:
                # Keep everything after the first colon, e.g. "3:21." for
                # "Time of Game: 3:21.".
                game[key] = info.text.partition(':')[2].strip()

    # Append game data to full list
    games.append(game)


print('Results')
print('*' * 80)

# Print the games harvested to the console

for idx, game in enumerate(games):
    print(f'{idx}:  {game}')

# Write to CSV
csvfile = "C:/Users/Benny/Desktop/anatest.csv"

# newline='' is what the csv module expects for files it writes to; utf-8
# keeps non-ASCII text such as the degree sign in the weather field intact.
with open(csvfile, "w", newline='', encoding='utf-8') as output:
    writer = csv.DictWriter(output, fieldnames=csv_fields, restval='',
                            extrasaction='ignore', lineterminator='\n')
    writer.writeheader()
    writer.writerows(games)

非常感谢, 本尼


from bs4 import BeautifulSoup
import re

# Sample page fragment. The "Other Info" content is delivered inside an HTML
# comment, so a normal parse of the page does not expose it as tags.
data = """<div class="section_wrapper setup_commented commented" id="all_342042674">
<div class="section_heading">
  <span class="section_anchor" id="342042674_link" data-label="Other Info"></span>
    <h2>Other Info</h2>    <div class="section_heading_text">
</div><div class="placeholder"></div>
<!--
    <div class="section_content" id="div_342042674">
<div><strong>Umpires:</strong> HP - Greg Gibson, 1B - Jerry Layne, 2B - Jordan Baker, 3B - Vic Carapazza.</div>
<div><strong>Time of Game:</strong> 3:21.</div>
<div><strong>Attendance:</strong> 33,809.</div>
<div><strong>Start Time Weather:</strong> 70&deg; F, Wind 6mph out to Centerfield, Night, No Precipitation.</div>
</div>
-->
</div>"""

# Take the text between the first "<!--" and the next "-->" and parse that
# text as its own document. DOTALL lets "." match across newlines.
commented_html = re.findall(r'(?<=<!--)(.*?)(?=-->)', data, flags=re.DOTALL)[0]
soup = BeautifulSoup(commented_html, 'lxml')

# The selector matches the direct child <div>s of the section by class, so
# the changing id attribute is never needed. Unpacking expects exactly four.
umpires, time_of_game, attendance, start_time_weather = soup.select('div.section_content > div')

print('ID: ', soup.find('div', class_="section_content")['id'])
print('umpires: ', umpires.text)
print('time of game: ', time_of_game.text)
print('attendance: ', attendance.text)
print('start_time_weather: ', start_time_weather.text)


ID:  div_342042674
umpires:  Umpires: HP - Greg Gibson, 1B - Jerry Layne, 2B - Jordan Baker, 3B - Vic Carapazza.
time of game:  Time of Game: 3:21.
attendance:  Attendance: 33,809.
start_time_weather:  Start Time Weather: 70° F, Wind 6mph out to Centerfield, Night, No Precipitation.


import requests
from bs4 import BeautifulSoup

# Address of the box-score page to scrape.
# NOTE(review): the URL is empty in this listing and has to be filled in
# before the script can fetch anything.
url = ""

res = requests.get(url)
# Stop on an HTTP error status instead of parsing an error page.
res.raise_for_status()
# Removing the comment delimiters turns the commented-out sections into
# regular markup that the parser sees as tags.
content = res.text.replace("<!--","").replace("-->","")
soup = BeautifulSoup(content,"lxml")
# Calling a Tag is shorthand for find_all(), so this collects the <strong>
# labels of the third "section_content" block. Unpacking expects exactly four.
# NOTE(review): the fixed index 2 assumes the block's position is the same on
# every page — confirm against pages from other seasons.
umpire, gametime, attendance, weather = soup.find_all(class_="section_content")[2]("strong")

# Each value is the text node that follows its <strong> label.
print(umpire.next_sibling)
print(weather.next_sibling)


 HP - Greg Gibson, 1B - Jerry Layne, 2B - Jordan Baker, 3B - Vic Carapazza.
 70° F, Wind 6mph out to Centerfield, Night, No Precipitation.