BeautifulSoup 绝对 URL 打印为 CSV
BeautifulSoup Absolute URLs Print to CSV
我在这里浏览了大量帖子,想找到修复这段代码的方法,但始终无法让它正常工作。我正在尝试从网站上抓取链接,然后写入 CSV。这是代码:
我找到了一种方法,已经完成了 95%,但在获取 href 这一步还缺少一些东西:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import csv
# Asker's original attempt, kept verbatim: download the page and parse it
# with the lxml parser.
j = urllib.request.urlopen("http://cnn.com")
soup = BeautifulSoup(j, "lxml")
# Collect every <a> element that has an href attribute.
# NOTE(review): this holds the matched elements themselves, not their href
# strings -- confirm against the BeautifulSoup find_all documentation.
data = soup.find_all('a', href=True)
for url in soup.find_all('a', href=True):
# NOTE(review): indentation was lost in this paste, and the only statement
# in the loop body is commented out, so the loop never reads the href.
#print(url.get('href'))
with open('marcel.csv', 'w', newline='') as csvfile:
write = csv.writer(csvfile)
# NOTE(review): writes `data` (the elements) rather than the href values --
# presumably the missing piece the question is asking about.
write.writerows(data)
我用 openpyxl。
先这样导入:
from openpyxl import Workbook,load_workbook
我觉得很简单。
这是我项目的一部分,你可以试试
def createExcel(self):
    """Create 'book_hist.xlsx' with a single sheet holding only the header row.

    A write-only workbook is created, the sheet '书籍列表' is added to it,
    the column headers are appended as the first row, and the workbook is
    saved to 'book_hist.xlsx' (path relative to the working directory).
    """
    # `optimized_write` is the legacy spelling of this flag; current openpyxl
    # releases only accept `write_only`.
    wb = Workbook(write_only=True)
    ws = wb.create_sheet(title='书籍列表')
    row0 = ['编号', '条码号', '题名', '责任者', '借阅日期', '归还日期', '馆藏地']
    ws.append(row0)
    save_path = 'book_hist.xlsx'
    wb.save(save_path)
def saveToExcel(self, data_list):
    """Append the rows in *data_list* to the '书籍列表' sheet and save the file.

    Args:
        data_list: iterable of rows; each row is handed unchanged to
            ``ws.append``.

    Raises:
        KeyError: if 'book_hist.xlsx' has no sheet named '书籍列表'.
    """
    wb = load_workbook(filename='book_hist.xlsx')
    # Index lookup replaces the deprecated `get_sheet_by_name`.
    ws = wb['书籍列表']
    for row in data_list:
        ws.append(row)
    save_path = 'book_hist.xlsx'
    wb.save(save_path)
这可能是您想要做的。
from bs4 import BeautifulSoup
import requests #better than urllib
import csv
# Local import keeps this snippet self-contained.
from urllib.parse import urljoin

# Download the page; keep the response so the final (post-redirect) URL can
# serve as the base for resolving relative links.
response = requests.get("http://cnn.com")
j = response.content
soup = BeautifulSoup(j, "lxml")

# Collect the href of every <a> element, resolved to an absolute URL.
data = []
for url in soup.find_all('a', href=True):
    absolute_url = urljoin(response.url, url['href'])
    print(absolute_url)
    data.append(absolute_url)
print(data)

# newline='' is what the csv module expects for files it writes to.
with open("marcel.csv", 'w', newline='') as csvfile:
    write = csv.writer(csvfile, delimiter = ' ')
    # writerows() expects an iterable of rows. Passing bare strings would
    # split every URL into one field per character, so wrap each URL in a
    # one-element row.
    write.writerows([link] for link in data)
我在这里浏览了大量帖子,想找到修复这段代码的方法,但始终无法让它正常工作。我正在尝试从网站上抓取链接,然后写入 CSV。这是代码:
我找到了一种方法,已经完成了 95%,但在获取 href 这一步还缺少一些东西:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import csv
# Asker's original attempt, kept verbatim: download the page and parse it
# with the lxml parser.
j = urllib.request.urlopen("http://cnn.com")
soup = BeautifulSoup(j, "lxml")
# Collect every <a> element that has an href attribute.
# NOTE(review): this holds the matched elements themselves, not their href
# strings -- confirm against the BeautifulSoup find_all documentation.
data = soup.find_all('a', href=True)
for url in soup.find_all('a', href=True):
# NOTE(review): indentation was lost in this paste, and the only statement
# in the loop body is commented out, so the loop never reads the href.
#print(url.get('href'))
with open('marcel.csv', 'w', newline='') as csvfile:
write = csv.writer(csvfile)
# NOTE(review): writes `data` (the elements) rather than the href values --
# presumably the missing piece the question is asking about.
write.writerows(data)
我用 openpyxl。
先这样导入:
from openpyxl import Workbook,load_workbook
我觉得很简单。 这是我项目的一部分,你可以试试
def createExcel(self):
    """Create 'book_hist.xlsx' with a single sheet holding only the header row.

    A write-only workbook is created, the sheet '书籍列表' is added to it,
    the column headers are appended as the first row, and the workbook is
    saved to 'book_hist.xlsx' (path relative to the working directory).
    """
    # `optimized_write` is the legacy spelling of this flag; current openpyxl
    # releases only accept `write_only`.
    wb = Workbook(write_only=True)
    ws = wb.create_sheet(title='书籍列表')
    row0 = ['编号', '条码号', '题名', '责任者', '借阅日期', '归还日期', '馆藏地']
    ws.append(row0)
    save_path = 'book_hist.xlsx'
    wb.save(save_path)
def saveToExcel(self, data_list):
    """Append the rows in *data_list* to the '书籍列表' sheet and save the file.

    Args:
        data_list: iterable of rows; each row is handed unchanged to
            ``ws.append``.

    Raises:
        KeyError: if 'book_hist.xlsx' has no sheet named '书籍列表'.
    """
    wb = load_workbook(filename='book_hist.xlsx')
    # Index lookup replaces the deprecated `get_sheet_by_name`.
    ws = wb['书籍列表']
    for row in data_list:
        ws.append(row)
    save_path = 'book_hist.xlsx'
    wb.save(save_path)
这可能是您想要做的。
from bs4 import BeautifulSoup
import requests #better than urllib
import csv
# Local import keeps this snippet self-contained.
from urllib.parse import urljoin

# Download the page; keep the response so the final (post-redirect) URL can
# serve as the base for resolving relative links.
response = requests.get("http://cnn.com")
j = response.content
soup = BeautifulSoup(j, "lxml")

# Collect the href of every <a> element, resolved to an absolute URL.
data = []
for url in soup.find_all('a', href=True):
    absolute_url = urljoin(response.url, url['href'])
    print(absolute_url)
    data.append(absolute_url)
print(data)

# newline='' is what the csv module expects for files it writes to.
with open("marcel.csv", 'w', newline='') as csvfile:
    write = csv.writer(csvfile, delimiter = ' ')
    # writerows() expects an iterable of rows. Passing bare strings would
    # split every URL into one field per character, so wrap each URL in a
    # one-element row.
    write.writerows([link] for link in data)