如何根据网络抓取的输出创建带有空白单元格的 pandas 数据框?
How can I create a pandas dataframe with blank cells based on the output of webscraping?
我正在尝试使用以下代码抓取网页,但是,由于 'mis-matched' 行,我收到错误消息。我想要实现的是一个 pandas 数据框,其中包含课程名称,然后是全日制代码,全日制 URL,兼职代码,兼职 URL。问题是并非所有课程都有全日制和非全日制课程,所以当我试图用“NA”替换空白以获得相同的行数时,它会产生错误。
以下代码提供所有课程的全日制和非全日制课程的输出,并且此代码不会产生错误,因为它只允许包含所有 5 个元素的课程:
#Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
#Specify URL
url = "http://eecs.qmul.ac.uk/postgraduate/programmes"
html = urlopen(url)
# Print the first 10 table rows
rows = soup.find_all('tr')
print(rows[:10])
#Create data frame
df = pd.DataFrame(columns = ['Course Name', 'Part Time Code', 'Part Time URL', 'Full Time Code', 'Full Time URL'])
#Create loop to go through all rows
for row in rows:
courses = row.find_all("td")
# The fragments list will store things to be included in the final string, such as the course title and its URLs
fragments = []
for course in courses:
if course.text.isspace():
continue
# Add the <td>'s text to fragments
fragments.append(course.text)
# Try and find an <a> tag
a_tag = course.find("a")
if a_tag:
# If one was found, add the URL to fragments
fragments.append(a_tag["href"])
# Make a string containing every fragment with ", " spacing them apart.
cleantext = ", ".join(fragments)
#Add rows to the dataframe if the information exists
if len(fragments) == 5:
df.loc[len(df.index)] = fragments
df.head(30)
这是输出:
这是我用来尝试用 NA 替换空白以确保每行中有 5 个元素的方法:
#Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
#Specify URL
url = "http://eecs.qmul.ac.uk/postgraduate/programmes"
html = urlopen(url)
# Print the first 10 table rows
rows = soup.find_all('tr')
#Create data frame
df = pd.DataFrame(columns = ['Course Name', 'Part Time Code', 'Part Time URL', 'Full Time Code', 'Full Time URL'])
#Create loop to go through all rows
for row in rows:
courses = row.find_all("td")
# The fragments list will store things to be included in the final string, such as the course title and its URLs
fragments = []
for course in courses:
if course.text.isspace():
fragments.append("NA")
else:
# Add the <td>'s text to fragments
fragments.append(course.text)
# Try and find an <a> tag
a_tag = course.find("a")
if a_tag:
# If one was found, add the URL to fragments
fragments.append(a_tag["href"])
else:
fragments.append("NA")
# Make a string containing every fragment with ", " spacing them apart.
cleantext = ", ".join(fragments)
#Add rows to the dataframe if the information exists
if len(fragments) > 0:
df.loc[len(df.index)] = fragments
df.head(30)
这是返回的错误:
ValueError Traceback (most recent call last)
<ipython-input-28-94bb08463416> in <module>()
38 #Add rows to the dataframe if the information exists
39 if len(fragments) > 0:
---> 40 df.loc[len(df.index)] = fragments
41 df.head(30)
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _setitem_with_indexer_missing(self, indexer, value)
1854 # must have conforming columns
1855 if len(value) != len(self.obj.columns):
-> 1856 raise ValueError("cannot set a row with mismatched columns")
1857
1858 value = Series(value, index=self.obj.columns, name=indexer)
ValueError: cannot set a row with mismatched columns
您能否确定我如何解决此问题,以便没有兼职代码或 URL 的课程仍包含在数据框中?
比这简单得多。通过 id 找到 table,然后将 prettify 处理后的版本直接输入 Pandas IO。Pandas 开箱即用地处理 NaN。
# Fetch and parse the page; an explicit parser avoids bs4's
# "no parser specified" warning and keeps results reproducible.
soup = BeautifulSoup(urlopen('http://eecs.qmul.ac.uk/postgraduate/programmes'), "html.parser")
# Locate the programmes table by its id
table = soup.find("table", {"id": "PGCourse"})
# Let pandas parse the HTML table directly; empty cells become NaN
df = pd.read_html(table.prettify())[0]
# rename columns
df.columns = ['Course Name', 'Part Time Code', 'Full Time Code']
编辑:好的,然后要获取您确实需要迭代的链接:
# Collect the part-time / full-time URLs row by row, skipping the header.
pt_links = []
ft_links = []
for tr in table.find_all("tr")[1:]:
    cells = tr.find_all("td")
    # The 2nd and 3rd cells hold the part-time / full-time codes;
    # record each cell's first link, or '' when the cell has none.
    for cell, bucket in ((cells[1], pt_links), (cells[2], ft_links)):
        anchors = cell.find_all('a')
        bucket.append(anchors[0]['href'] if anchors else '')
df['Part Time URL'] = pt_links
df['Full Time URL'] = ft_links
# rearrange the columns (optional)
df = df[['Course Name', 'Part Time Code', 'Part Time URL', 'Full Time Code', 'Full Time URL']]
我正在尝试使用以下代码抓取网页,但是,由于 'mis-matched' 行,我收到错误消息。我想要实现的是一个 pandas 数据框,其中包含课程名称,然后是全日制代码,全日制 URL,兼职代码,兼职 URL。问题是并非所有课程都有全日制和非全日制课程,所以当我试图用“NA”替换空白以获得相同的行数时,它会产生错误。
以下代码提供所有课程的全日制和非全日制课程的输出,并且此代码不会产生错误,因为它只允许包含所有 5 个元素的课程:
#Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
#Specify URL
url = "http://eecs.qmul.ac.uk/postgraduate/programmes"
html = urlopen(url)
# Print the first 10 table rows
rows = soup.find_all('tr')
print(rows[:10])
#Create data frame
df = pd.DataFrame(columns = ['Course Name', 'Part Time Code', 'Part Time URL', 'Full Time Code', 'Full Time URL'])
#Create loop to go through all rows
for row in rows:
courses = row.find_all("td")
# The fragments list will store things to be included in the final string, such as the course title and its URLs
fragments = []
for course in courses:
if course.text.isspace():
continue
# Add the <td>'s text to fragments
fragments.append(course.text)
# Try and find an <a> tag
a_tag = course.find("a")
if a_tag:
# If one was found, add the URL to fragments
fragments.append(a_tag["href"])
# Make a string containing every fragment with ", " spacing them apart.
cleantext = ", ".join(fragments)
#Add rows to the dataframe if the information exists
if len(fragments) == 5:
df.loc[len(df.index)] = fragments
df.head(30)
这是输出:
这是我用来尝试用 NA 替换空白以确保每行中有 5 个元素的方法:
#Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
#Specify URL
url = "http://eecs.qmul.ac.uk/postgraduate/programmes"
html = urlopen(url)
# Print the first 10 table rows
rows = soup.find_all('tr')
#Create data frame
df = pd.DataFrame(columns = ['Course Name', 'Part Time Code', 'Part Time URL', 'Full Time Code', 'Full Time URL'])
#Create loop to go through all rows
for row in rows:
courses = row.find_all("td")
# The fragments list will store things to be included in the final string, such as the course title and its URLs
fragments = []
for course in courses:
if course.text.isspace():
fragments.append("NA")
else:
# Add the <td>'s text to fragments
fragments.append(course.text)
# Try and find an <a> tag
a_tag = course.find("a")
if a_tag:
# If one was found, add the URL to fragments
fragments.append(a_tag["href"])
else:
fragments.append("NA")
# Make a string containing every fragment with ", " spacing them apart.
cleantext = ", ".join(fragments)
#Add rows to the dataframe if the information exists
if len(fragments) > 0:
df.loc[len(df.index)] = fragments
df.head(30)
这是返回的错误:
ValueError Traceback (most recent call last)
<ipython-input-28-94bb08463416> in <module>()
38 #Add rows to the dataframe if the information exists
39 if len(fragments) > 0:
---> 40 df.loc[len(df.index)] = fragments
41 df.head(30)
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _setitem_with_indexer_missing(self, indexer, value)
1854 # must have conforming columns
1855 if len(value) != len(self.obj.columns):
-> 1856 raise ValueError("cannot set a row with mismatched columns")
1857
1858 value = Series(value, index=self.obj.columns, name=indexer)
ValueError: cannot set a row with mismatched columns
您能否确定我如何解决此问题,以便没有兼职代码或 URL 的课程仍包含在数据框中?
比这简单得多。通过 id 找到 table,然后将 prettify 处理后的版本直接输入 Pandas IO。Pandas 开箱即用地处理 NaN。
# Fetch and parse the page; an explicit parser avoids bs4's
# "no parser specified" warning and keeps results reproducible.
soup = BeautifulSoup(urlopen('http://eecs.qmul.ac.uk/postgraduate/programmes'), "html.parser")
# Locate the programmes table by its id
table = soup.find("table", {"id":"PGCourse"})
# Let pandas parse the HTML table directly; empty cells become NaN
df = pd.read_html(table.prettify())[0]
# rename columns
df.columns = ['Course Name', 'Part Time Code', 'Full Time Code']
编辑:好的,然后要获取您确实需要迭代的链接:
# Collect the part-time / full-time URLs row by row, skipping the header.
pt_links = []
ft_links = []
for tr in table.find_all("tr")[1:]:
    cells = tr.find_all("td")
    # The 2nd and 3rd cells hold the part-time / full-time codes;
    # record each cell's first link, or '' when the cell has none.
    for cell, bucket in ((cells[1], pt_links), (cells[2], ft_links)):
        anchors = cell.find_all('a')
        bucket.append(anchors[0]['href'] if anchors else '')
df['Part Time URL'] = pt_links
df['Full Time URL'] = ft_links
# rearrange the columns (optional)
df = df[['Course Name', 'Part Time Code', 'Part Time URL', 'Full Time Code', 'Full Time URL']]