Scraping web content from the first two pages and exporting the scraped data to CSV using Python and BS4
I am new to Python and am using Python 3.6.2. I am trying to scrape data from the first 2 pages of search results for a particular keyword. So far I can get the data into the Python IDLE window, but I am having difficulty exporting it to CSV. I have tried BeautifulSoup 4 and pandas but cannot get the export to work. This is what I have done so far; any help would be much appreciated.
import csv
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "http://www.amazon.in/s/ref=nb_sb_noss?url=search-alias%3Dautomotive&field-keywords=helmets+for+men&rh=n%3A4772060031%2Ck%3Ahelmets+for+men&ajr=0"
request = requests.get(url)
soup = BeautifulSoup(request.content, "lxml")
#filename = auto.csv
#with open(str(auto.csv,"r+","\n")) as csvfile:
#headers = "Count , Asin \n"
#fo.writer(headers)
for url in soup.find_all('li'):
    Nand = url.get('data-asin')
    #print(Nand)
    Result = url.get('id')
    #print(Result)
    #d=(str(Nand), str(Result))
    df = pd.Index(url.get_attribute('url'))
    #with open("auto.txt", "w",newline='') as dumpfile:
    #dumpfilewriter = csv.writer(dumpfile)
    #for Nand in soup:
    #value = Nand.__gt__
    #if value:
    #dumpfilewriter.writerows([value])
    df.to_csv(dumpfile)
dumpfile.close()
csvfile.csv.writer("auto.csv," , ',' ,'|' , "\n")
Question: Help me export the data in the variables "Nand" and "Result" to a CSV file.
with open("auto.csv", 'w') as fh:
writer = csv.DictWriter(fh, fieldnames=['Nand', 'Result'])
writer.writeheader()
data = {}
for url in soup.find_all('li'):
data['Nand'] = url.get('data-asin')
data['Result'] = url.get('id')
writer.writerow(data)
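A small side note on the snippet above (a sketch, assuming soup has already been built as in the question's code): opening the file with newline='' avoids blank rows on Windows, and skipping <li> tags that carry no data-asin keeps empty rows out of the file.

import csv

# Same DictWriter approach, with newline='' (as recommended by the csv module
# docs) and a guard that skips <li> tags that are not actual search results.
with open("auto.csv", 'w', newline='') as fh:
    writer = csv.DictWriter(fh, fieldnames=['Nand', 'Result'])
    writer.writeheader()
    for li in soup.find_all('li'):
        asin = li.get('data-asin')
        if asin:  # only write rows that actually have an ASIN
            writer.writerow({'Nand': asin, 'Result': li.get('id')})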
Tested with Python 3.4.2. I added a user-agent header to the request so the site does not automatically block the bot. You were getting a lot of None values because you did not specify which <li> tags you wanted; I added that to the code as well.
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "http://www.amazon.in/s/ref=nb_sb_noss?url=search-alias%3Dautomotive&field-keywords=helmets+for+men&rh=n%3A4772060031%2Ck%3Ahelmets+for+men&ajr=0"
request = requests.get(url, headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
soup = BeautifulSoup(request.content, "lxml")
res = []
for url in soup.find_all('li', class_ = 's-result-item'):
    res.append([url.get('data-asin'), url.get('id')])
df = pd.DataFrame(data=res, columns=['Nand', 'Result'])
df.to_csv('path/where/you/want/to/store/file.csv')
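One small usage note on to_csv: by default pandas also writes the DataFrame index as an extra first column, so if you only want the Nand and Result columns you can pass index=False (the path below is the same placeholder as above).

# Write only the two scraped columns, without the numeric index column.
df.to_csv('path/where/you/want/to/store/file.csv', index=False)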
EDIT: To process all the pages you need to build a loop that generates the urls and feeds them to the main processing block you already have. Take a look at this page: http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&page=2&keywords=helmets+for+men&ie=UTF8&qid=1501133688&spIA=B01N0MAT2E,B01MY1ZZDS,B01N0RMJ1H
EDIT_2: Let's loop over the page parameter. You can append page to the url manually and then pass it to requests.get().
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&keywords=helmets+for+men&ie=UTF8"
#excluding page from base_url for further adding
res = []
for page in range(1, 72):  # such range because the last page for the needed category is 71
    request = requests.get(base_url + '&page=' + str(page), headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})  # adding page here
    if request.status_code == 404:  # added just in case of an error
        break
    soup = BeautifulSoup(request.content, "lxml")
    for url in soup.find_all('li', class_ = 's-result-item'):
        res.append([url.get('data-asin'), url.get('id')])
df = pd.DataFrame(data=res, columns=['Nand', 'Result'])
df.to_csv('path/where/you/want/to/store/file.csv')
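As an optional post-processing sketch (assuming res has been filled by the loop above): some matched <li> elements still have no data-asin, so you can drop those rows before exporting and again skip the index column.

import pandas as pd

# Drop rows whose data-asin was missing (None) before writing the CSV.
df = pd.DataFrame(data=res, columns=['Nand', 'Result'])
df = df.dropna(subset=['Nand'])
df.to_csv('path/where/you/want/to/store/file.csv', index=False)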