Pandas:将所有 re.search 结果从 BeautifulSoup 写入 csv
Pandas: Write all re.search results to csv from BeautifulSoup
我有一个 Python pandas
脚本的这些开头,该脚本在 Google 中搜索值并获取它可以在第一页上找到的任何 PDF 链接。
我有两个问题,列在下面。
import pandas as pd
from bs4 import BeautifulSoup
import urllib2
import re
df = pd.DataFrame(["Shakespeare", "Beowulf"], columns=["Search"])
print "Searching for PDFs ..."
hdr = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
"Accept-Encoding": "none",
"Accept-Language": "en-US,en;q=0.8",
"Connection": "keep-alive"}
def crawl(search):
google = "http://www.google.com/search?q="
url = google + search + "+" + "PDF"
req = urllib2.Request(url, headers=hdr)
pdf_links = None
placeholder = None #just a column placeholder
try:
page = urllib2.urlopen(req).read()
soup = BeautifulSoup(page)
cite = soup.find_all("cite", attrs={"class":"_Rm"})
for link in cite:
all_links = re.search(r".+", link.text).group().encode("utf-8")
if all_links.endswith(".pdf"):
pdf_links = re.search(r"(.+)pdf$", all_links).group()
print pdf_links
except urllib2.HTTPError, e:
print e.fp.read()
return pd.Series([pdf_links, placeholder])
df[["PDF links", "Placeholder"]] = df["Search"].apply(crawl)
df.to_csv(FileName, index=False, delimiter=",")
print pdf_links
的结果将是:
davidlucking.com/documents/Shakespeare-Complete%20Works.pdf
sparks.eserver.org/books/shakespeare-tempest.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
calhoun.k12.il.us/teachers/wdeffenbaugh/.../Shakespeare%20Sonnets.pdf
www.yorku.ca/inpar/Beowulf_Child.pdf
www.yorku.ca/inpar/Beowulf_Child.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
www.penguin.com/static/pdf/.../beowulf.pdf
www.neshaminy.org/cms/lib6/.../380/text.pdf
www.neshaminy.org/cms/lib6/.../380/text.pdf
sparks.eserver.org/books/beowulf.pdf
csv 输出如下所示:
Search PDF Links
Shakespeare calhoun.k12.il.us/teachers/wdeffenbaugh/.../Shakespeare%20Sonnets.pdf
Beowulf sparks.eserver.org/books/beowulf.pdf
问题:
- 有没有办法把所有结果都作为行写入 csv,而不是只写最下面那一个?如果可能,能否让对应于 "Shakespeare" 或 "Beowulf" 的每一行都包含 Search 列中的值?
- 如何写出完整的 pdf 链接,而不让长链接被自动用 "..." 缩写?
这将使用 soup.find_all("a",href=True)
为您提供所有正确的 pdf 链接,并将它们保存在 Dataframe 和 csv 中:
hdr = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
"Accept-Encoding": "none",
"Accept-Language": "en-US,en;q=0.8",
"Connection": "keep-alive"}
def crawl(columns=None, *search):
df = pd.DataFrame(columns= columns)
for term in search:
google = "http://www.google.com/search?q="
url = google + term + "+" + "PDF"
req = urllib2.Request(url, headers=hdr)
try:
page = urllib2.urlopen(req).read()
soup = BeautifulSoup(page)
pdfs = []
links = soup.find_all("a",href=True)
for link in links:
lk = link["href"]
if lk.endswith(".pdf"):
pdfs.append((term, lk))
df2 = pd.DataFrame(pdfs, columns=columns)
df = df.append(df2, ignore_index=True)
except urllib2.HTTPError, e:
print e.fp.read()
return df
df = crawl(["Search", "PDF link"],"Shakespeare","Beowulf")
df.to_csv("out.csv",index=False)
out.csv:
Search,PDF link
Shakespeare,http://davidlucking.com/documents/Shakespeare-Complete%20Works.pdf
Shakespeare,http://www.w3.org/People/maxf/XSLideMaker/hamlet.pdf
Shakespeare,http://sparks.eserver.org/books/shakespeare-tempest.pdf
Shakespeare,https://phillipkay.files.wordpress.com/2011/07/william-shakespeare-plays.pdf
Shakespeare,http://www.artsvivants.ca/pdf/eth/activities/shakespeare_overview.pdf
Shakespeare,http://triggs.djvu.org/djvu-editions.com/SHAKESPEARE/SONNETS/Download.pdf
Beowulf,http://www.yorku.ca/inpar/Beowulf_Child.pdf
Beowulf,https://is.muni.cz/el/1441/podzim2013/AJ2RC_STAL/2._Beowulf.pdf
Beowulf,http://teacherweb.com/IL/Steinmetz/MottramM/Beowulf---Seamus-Heaney.pdf
Beowulf,http://www.penguin.com/static/pdf/teachersguides/beowulf.pdf
Beowulf,http://www.neshaminy.org/cms/lib6/PA01000466/Centricity/Domain/380/text.pdf
Beowulf,http://www.sparknotes.com/free-pdfs/uscellular/download/beowulf.pdf
要获取 PDF 链接,您正在寻找这些选择器:
for result in soup.select('.tF2Cxc'):
    # results that carry the PDF badge have the .ZGwO7 class
    # (wrapping the lookup in try/except would work just as well)
    if result.select_one('.ZGwO7') is not None:
        pdf_file = result.select_one('.yuRUbf a')['href']
CSS 选择器参考。可以看看 SelectorGadget 这个 Chrome 扩展,它能通过在浏览器中单击所需的元素来获取对应的 CSS 选择器。
要将它们保存到 CSV
,您正在寻找这个:
# collect every link produced by the scraping loop
pdfs = []

# build a one-column DataFrame labelled "PDF Link" from that list
df = pd.DataFrame(pdfs, columns=['PDF Link'])

# write it out, dropping the default pandas index column. Done!
df.to_csv('PDFs.csv', index=False)
代码和example in the online IDE(也展示了如何保存到本地):
from bs4 import BeautifulSoup
import requests, lxml
import pandas as pd

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "best lasagna recipe:pdf"
}

# Fetch the first results page and parse it with the lxml backend.
html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')

# Keep the href of every organic result flagged as a PDF (.ZGwO7 badge).
pdfs = [result.select_one('.yuRUbf a')['href']
        for result in soup.select('.tF2Cxc')
        if result.select_one('.ZGwO7')]

# One "PDF Link" column, written without the default pandas index.
df = pd.DataFrame({'PDF Link': pdfs})
df.to_csv('Bs4_PDFs.csv', index=False)
-----------
# from CSV
'''
PDF Link
http://www.bakersedge.com/PDF/Lasagna.pdf
http://greatgreens.ca/recipes/Recipe%20-%20Worlds%20Best%20Lasagna.pdf
https://liparifoods.com/wp-content/uploads/2015/10/lipari-foods-holiday-recipes.pdf
...
'''
或者,您可以使用 SerpApi 中的 Google Organic Results API 来实现相同的目的。这是付费 API 和免费计划。
你的情况的不同之处在于,不是从头开始创建所有东西,找出为什么某些东西不能按预期工作,然后随着时间的推移维护它,你需要做的就是迭代结构化 JSON并得到你想要的数据。它也可能更具可读性,并且可以快速理解代码内部发生的事情。
与您的示例集成的代码:
from serpapi import GoogleSearch
import os
import pandas as pd

params = {
    "api_key": os.getenv("API_KEY"),
    "engine": "google",
    "q": "best lasagna recipe:pdf",
    "hl": "en"
}

search = GoogleSearch(params)
results = search.get_dict()

# keep only organic results whose link contains the .pdf file type
pdfs = [item['link']
        for item in results['organic_results']
        if '.pdf' in item['link']]

df = pd.DataFrame({'PDF Link': pdfs})
df.to_csv('SerpApi_PDFs.csv', index=False)
-----------
# from CSV
'''
PDF Link
http://www.bakersedge.com/PDF/Lasagna.pdf
http://greatgreens.ca/recipes/Recipe%20-%20Worlds%20Best%20Lasagna.pdf
https://liparifoods.com/wp-content/uploads/2015/10/lipari-foods-holiday-recipes.pdf
...
'''
Disclaimer, I work for SerpApi.
我有一个 Python pandas
脚本的这些开头,该脚本在 Google 中搜索值并获取它可以在第一页上找到的任何 PDF 链接。
我有两个问题,列在下面。
import pandas as pd
from bs4 import BeautifulSoup
import urllib2
import re
df = pd.DataFrame(["Shakespeare", "Beowulf"], columns=["Search"])
print "Searching for PDFs ..."
hdr = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
"Accept-Encoding": "none",
"Accept-Language": "en-US,en;q=0.8",
"Connection": "keep-alive"}
def crawl(search):
google = "http://www.google.com/search?q="
url = google + search + "+" + "PDF"
req = urllib2.Request(url, headers=hdr)
pdf_links = None
placeholder = None #just a column placeholder
try:
page = urllib2.urlopen(req).read()
soup = BeautifulSoup(page)
cite = soup.find_all("cite", attrs={"class":"_Rm"})
for link in cite:
all_links = re.search(r".+", link.text).group().encode("utf-8")
if all_links.endswith(".pdf"):
pdf_links = re.search(r"(.+)pdf$", all_links).group()
print pdf_links
except urllib2.HTTPError, e:
print e.fp.read()
return pd.Series([pdf_links, placeholder])
df[["PDF links", "Placeholder"]] = df["Search"].apply(crawl)
df.to_csv(FileName, index=False, delimiter=",")
print pdf_links
的结果将是:
davidlucking.com/documents/Shakespeare-Complete%20Works.pdf
sparks.eserver.org/books/shakespeare-tempest.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
calhoun.k12.il.us/teachers/wdeffenbaugh/.../Shakespeare%20Sonnets.pdf
www.yorku.ca/inpar/Beowulf_Child.pdf
www.yorku.ca/inpar/Beowulf_Child.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
www.penguin.com/static/pdf/.../beowulf.pdf
www.neshaminy.org/cms/lib6/.../380/text.pdf
www.neshaminy.org/cms/lib6/.../380/text.pdf
sparks.eserver.org/books/beowulf.pdf
csv 输出如下所示:
Search PDF Links
Shakespeare calhoun.k12.il.us/teachers/wdeffenbaugh/.../Shakespeare%20Sonnets.pdf
Beowulf sparks.eserver.org/books/beowulf.pdf
问题:
- 有没有办法把所有结果都作为行写入 csv,而不是只写最下面那一个?如果可能,能否让对应于 "Shakespeare" 或 "Beowulf" 的每一行都包含 Search 列中的值?
- 如何写出完整的 pdf 链接,而不让长链接被自动用 "..." 缩写?
这将使用 soup.find_all("a",href=True)
为您提供所有正确的 pdf 链接,并将它们保存在 Dataframe 和 csv 中:
hdr = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
"Accept-Encoding": "none",
"Accept-Language": "en-US,en;q=0.8",
"Connection": "keep-alive"}
def crawl(columns=None, *search):
df = pd.DataFrame(columns= columns)
for term in search:
google = "http://www.google.com/search?q="
url = google + term + "+" + "PDF"
req = urllib2.Request(url, headers=hdr)
try:
page = urllib2.urlopen(req).read()
soup = BeautifulSoup(page)
pdfs = []
links = soup.find_all("a",href=True)
for link in links:
lk = link["href"]
if lk.endswith(".pdf"):
pdfs.append((term, lk))
df2 = pd.DataFrame(pdfs, columns=columns)
df = df.append(df2, ignore_index=True)
except urllib2.HTTPError, e:
print e.fp.read()
return df
df = crawl(["Search", "PDF link"],"Shakespeare","Beowulf")
df.to_csv("out.csv",index=False)
out.csv:
Search,PDF link
Shakespeare,http://davidlucking.com/documents/Shakespeare-Complete%20Works.pdf
Shakespeare,http://www.w3.org/People/maxf/XSLideMaker/hamlet.pdf
Shakespeare,http://sparks.eserver.org/books/shakespeare-tempest.pdf
Shakespeare,https://phillipkay.files.wordpress.com/2011/07/william-shakespeare-plays.pdf
Shakespeare,http://www.artsvivants.ca/pdf/eth/activities/shakespeare_overview.pdf
Shakespeare,http://triggs.djvu.org/djvu-editions.com/SHAKESPEARE/SONNETS/Download.pdf
Beowulf,http://www.yorku.ca/inpar/Beowulf_Child.pdf
Beowulf,https://is.muni.cz/el/1441/podzim2013/AJ2RC_STAL/2._Beowulf.pdf
Beowulf,http://teacherweb.com/IL/Steinmetz/MottramM/Beowulf---Seamus-Heaney.pdf
Beowulf,http://www.penguin.com/static/pdf/teachersguides/beowulf.pdf
Beowulf,http://www.neshaminy.org/cms/lib6/PA01000466/Centricity/Domain/380/text.pdf
Beowulf,http://www.sparknotes.com/free-pdfs/uscellular/download/beowulf.pdf
要获取 PDF 链接,您正在寻找这些选择器:
for result in soup.select('.tF2Cxc'):
    # results that carry the PDF badge have the .ZGwO7 class
    # (wrapping the lookup in try/except would work just as well)
    if result.select_one('.ZGwO7') is not None:
        pdf_file = result.select_one('.yuRUbf a')['href']
CSS 选择器参考。可以看看 SelectorGadget 这个 Chrome 扩展,它能通过在浏览器中单击所需的元素来获取对应的 CSS 选择器。
要将它们保存到 CSV
,您正在寻找这个:
# collect every link produced by the scraping loop
pdfs = []

# build a one-column DataFrame labelled "PDF Link" from that list
df = pd.DataFrame(pdfs, columns=['PDF Link'])

# write it out, dropping the default pandas index column. Done!
df.to_csv('PDFs.csv', index=False)
代码和example in the online IDE(也展示了如何保存到本地):
from bs4 import BeautifulSoup
import requests, lxml
import pandas as pd

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "best lasagna recipe:pdf"
}

# Fetch the first results page and parse it with the lxml backend.
html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')

# Keep the href of every organic result flagged as a PDF (.ZGwO7 badge).
pdfs = [result.select_one('.yuRUbf a')['href']
        for result in soup.select('.tF2Cxc')
        if result.select_one('.ZGwO7')]

# One "PDF Link" column, written without the default pandas index.
df = pd.DataFrame({'PDF Link': pdfs})
df.to_csv('Bs4_PDFs.csv', index=False)
-----------
# from CSV
'''
PDF Link
http://www.bakersedge.com/PDF/Lasagna.pdf
http://greatgreens.ca/recipes/Recipe%20-%20Worlds%20Best%20Lasagna.pdf
https://liparifoods.com/wp-content/uploads/2015/10/lipari-foods-holiday-recipes.pdf
...
'''
或者,您可以使用 SerpApi 中的 Google Organic Results API 来实现相同的目的。这是付费 API 和免费计划。
你的情况的不同之处在于,不是从头开始创建所有东西,找出为什么某些东西不能按预期工作,然后随着时间的推移维护它,你需要做的就是迭代结构化 JSON并得到你想要的数据。它也可能更具可读性,并且可以快速理解代码内部发生的事情。
与您的示例集成的代码:
from serpapi import GoogleSearch
import os
import pandas as pd

params = {
    "api_key": os.getenv("API_KEY"),
    "engine": "google",
    "q": "best lasagna recipe:pdf",
    "hl": "en"
}

search = GoogleSearch(params)
results = search.get_dict()

# keep only organic results whose link contains the .pdf file type
pdfs = [item['link']
        for item in results['organic_results']
        if '.pdf' in item['link']]

df = pd.DataFrame({'PDF Link': pdfs})
df.to_csv('SerpApi_PDFs.csv', index=False)
-----------
# from CSV
'''
PDF Link
http://www.bakersedge.com/PDF/Lasagna.pdf
http://greatgreens.ca/recipes/Recipe%20-%20Worlds%20Best%20Lasagna.pdf
https://liparifoods.com/wp-content/uploads/2015/10/lipari-foods-holiday-recipes.pdf
...
'''
Disclaimer, I work for SerpApi.