从网络抓取循环中附加新列和行
Appending new column and rows from web scrape loop
我可以在代码末尾添加什么,以将项目列表作为第 1 列附加到数据框中,并将 for 循环生成的 webscrape 数据附加到第 2 列,同时使第 1 列的每个列表项与对应的 webscrape 数据相匹配?我正在尝试获取 -
col1 col2
url1 A Details
url2 B Details
我的代码是,
urls = ['url1','url2']

from bs4 import BeautifulSoup
import requests
from time import sleep
from random import randint

# Visit every URL, strip out script/style elements, and print the
# remaining human-visible text of each page.
for url in urls:
    # Wait a random 11-23 seconds between requests to be polite to the server.
    sleep(randint(11, 23))
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',}
    response = requests.get(url, headers=headers)
    page = BeautifulSoup(response.content, features="lxml")
    # Remove non-visible elements so get_text() yields only readable content.
    for element in page(["script", "style"]):
        element.extract()
    raw = page.get_text()
    stripped_lines = (ln.strip() for ln in raw.splitlines())
    phrases = (piece.strip() for ln in stripped_lines for piece in ln.split(" "))
    text = '\n'.join(p for p in phrases if p)
    print(text)
最简单的方法是将数据附加到循环内的列表,然后从两个列表创建数据框。
from bs4 import BeautifulSoup
import requests
import pandas as pd  # FIX: pd.DataFrame is used below but pandas was never imported
from time import sleep
from random import randint

# NOTE(review): expects `urls` (a list of URL strings) to be defined earlier,
# as in the question's snippet — confirm it is in scope before running.

# Accumulator lists; appending link and text in the same iteration keeps
# the two columns row-aligned in the final DataFrame.
url_list = []
details_list = []

for link in urls:
    sleep(randint(11, 23))  # polite random delay between requests
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',}
    req = requests.get(link, headers=headers)
    soup = BeautifulSoup(req.content, features="lxml")
    # Drop script/style tags so get_text() returns only visible text.
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    url_list.append(link)      # column 1: the URL that was scraped
    details_list.append(text)  # column 2: the cleaned page text for that URL

# Build the result frame from both lists; row i of col1 matches row i of col2.
results_df = pd.DataFrame()
results_df['col1'] = url_list      # adds column with URLs
results_df['col2'] = details_list  # adds column with scraped details
我可以在代码末尾添加什么,以将项目列表作为第 1 列附加到数据框中,并将 for 循环生成的 webscrape 数据附加到第 2 列,同时使第 1 列的每个列表项与对应的 webscrape 数据相匹配?我正在尝试获取 -
col1 col2
url1 A Details
url2 B Details
我的代码是,
urls = ['url1','url2']

from bs4 import BeautifulSoup
import requests
from time import sleep
from random import randint

# Visit every URL, strip out script/style elements, and print the
# remaining human-visible text of each page.
for url in urls:
    # Wait a random 11-23 seconds between requests to be polite to the server.
    sleep(randint(11, 23))
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',}
    response = requests.get(url, headers=headers)
    page = BeautifulSoup(response.content, features="lxml")
    # Remove non-visible elements so get_text() yields only readable content.
    for element in page(["script", "style"]):
        element.extract()
    raw = page.get_text()
    stripped_lines = (ln.strip() for ln in raw.splitlines())
    phrases = (piece.strip() for ln in stripped_lines for piece in ln.split(" "))
    text = '\n'.join(p for p in phrases if p)
    print(text)
最简单的方法是将数据附加到循环内的列表,然后从两个列表创建数据框。
from bs4 import BeautifulSoup
import requests
import pandas as pd  # FIX: pd.DataFrame is used below but pandas was never imported
from time import sleep
from random import randint

# NOTE(review): expects `urls` (a list of URL strings) to be defined earlier,
# as in the question's snippet — confirm it is in scope before running.

# Accumulator lists; appending link and text in the same iteration keeps
# the two columns row-aligned in the final DataFrame.
url_list = []
details_list = []

for link in urls:
    sleep(randint(11, 23))  # polite random delay between requests
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',}
    req = requests.get(link, headers=headers)
    soup = BeautifulSoup(req.content, features="lxml")
    # Drop script/style tags so get_text() returns only visible text.
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    url_list.append(link)      # column 1: the URL that was scraped
    details_list.append(text)  # column 2: the cleaned page text for that URL

# Build the result frame from both lists; row i of col1 matches row i of col2.
results_df = pd.DataFrame()
results_df['col1'] = url_list      # adds column with URLs
results_df['col2'] = details_list  # adds column with scraped details