使用 Python 对阿拉伯语文本进行网页抓取
Web scraping python for Arabic text
我正在尝试使用 Python 和 BeautifulSoup 抓取网站“http://norumors.net/?post_type=rumors?post_type=rumors”,只获取新闻标题并将它们写入 CSV 文件。下面是我查看 HTML 源代码(view-source:http://norumors.net/?post_type=rumors?post_type=rumors)之后编写的代码:
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Scrape the rumor listing pages and try to write one "Statement,Link" row per
# <li class="o-listicle__item"> element to NEWS.csv, also collecting the rows
# into a pandas DataFrame.
# The block structure below was reconstructed from the control flow; the
# statements themselves are unchanged.
pagesToGet= 1
upperframe=[]
for page in range(1,pagesToGet+1):
    print('processing page :', page)
    url = 'http://norumors.net/?post_type=rumors/?page='+str(page)
    print(url)
    #an exception might be thrown, so the code should be in a try-except block
    try:
        #use the browser to get the url. This is suspicious command that might blow up.
        # NOTE(review): this rebinds the loop variable `page` to the response object.
        page=requests.get(url) # this might throw an exception if something goes wrong.
    except Exception as e: # this describes what to do if an exception is thrown
        error_type, error_obj, error_info = sys.exc_info() # get the exception information
        print ('ERROR FOR LINK:',url) #print the link that cause the problem
        print (error_type, 'Line:', error_info.tb_lineno) #print error info and line that threw the exception
        continue #ignore this page. Abandon this and go back.
    time.sleep(2)
    soup=BeautifulSoup(page.text,'html.parser')
    frame=[]
    links=soup.find_all('li',attrs={'class':'o-listicle__item'})
    # Number of matched <li> elements; 0 means the loop below never runs.
    print(len(links))
    filename="NEWS.csv"
    # NOTE(review): mode "w" truncates the file on every page iteration.
    f=open(filename,"w", encoding = 'utf-8')
    headers="Statement,Link\n"
    f.write(headers)
    for j in links:
        Statement = j.find("div",attrs={'class':'row d-flex'}).text.strip()
        # Link = "http://norumors.net/"
        # NOTE(review): with the assignment above commented out, `Link` is never
        # initialised, so `+=` raises NameError on the first iteration.
        Link += j.find("div",attrs={'class':'col-lg-4 col-md-4 col-sm-6 col-xs-6'}).find('a')['href'].strip()
        frame.append((Statement,Link))
        # NOTE(review): Date, Source and Label are not defined anywhere in this
        # script, and the header above declares only two columns.
        f.write(Statement.replace(",","^")+","+Link+","+Date.replace(",","^")+","+Source.replace(",","^")+","+Label.replace(",","^")+"\n")
    upperframe.extend(frame)
    f.close()
data=pd.DataFrame(upperframe, columns=['Statement','Link'])
data.head()
但是运行代码之后,我得到的 pandas 数据框和 CSV 文件都是空的。请问这是为什么?有什么建议吗?需要说明的是,我想获取的是标签之间的文本。
如果我没理解错的话,您想获取新闻标题的文本以及这些新闻的 href 链接,并将它们写入 CSV 文件。您的代码的问题在于 `for j in links:` 循环从未执行,因为 `soup.find_all('li',attrs={'class':'o-listicle__item'})` 返回的是一个空列表。您应该注意要搜索的标签名称和类名。下面的代码获取新闻文本及其链接,并使用 `pd.DataFrame` 将它们写入 CSV 文件:
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Scrape the rumor listing pages: for every card in the "row d-flex" grid take
# the thumbnail's alt text (the headline) and the archive link's href, then
# write all collected rows to NEWS.csv in one go.
pagesToGet = 1
filename = "NEWS.csv"

# Accumulate across ALL pages; resetting these (or reopening the file in "w"
# mode) inside the loop would keep only the last page's rows.
texts = []
links = []

for page in range(1, pagesToGet + 1):
    print('processing page :', page)
    # NOTE(review): the query string contains two '?'; kept as in the original,
    # confirm the site's real pagination parameter before fetching more pages.
    url = 'http://norumors.net/?post_type=rumors/?page=' + str(page)
    print(url)

    # Network errors are expected here, so only the request sits in the try.
    try:
        # A timeout keeps an unresponsive server from hanging the script forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print('ERROR FOR LINK:', url)  # the link that caused the problem
        print(type(exc), 'Line:', exc.__traceback__.tb_lineno)
        continue  # skip this page and move on to the next one

    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find("div", attrs={'class': 'row d-flex'})
    if container is None:
        # The page layout differs from what is expected; nothing to extract.
        print('No rumor grid found on:', url)
        continue

    cards = container.find_all("div", attrs={'class': 'col-lg-4 col-md-4 col-sm-6 col-xs-6'})
    for card in cards:
        img = card.find("img", attrs={'class': 'rumor__thumb'})
        anchor = card.find("a", attrs={'class': 'rumor--archive'})
        # Append to both lists or to neither, so headline and link stay paired.
        if img is None or anchor is None:
            continue
        texts.append(img.get('alt', ''))
        links.append(anchor.get('href', ''))

data = pd.DataFrame(list(zip(texts, links)), columns=['Statement', 'Link'])
# newline='' lets the CSV writer control line endings; the context manager
# closes the file even if writing fails.
with open(filename, "w", encoding='utf-8', newline='') as f:
    data.to_csv(f, index=False)
我正在尝试使用 Python 和 BeautifulSoup 抓取网站“http://norumors.net/?post_type=rumors?post_type=rumors”,只获取新闻标题并将它们写入 CSV 文件。下面是我查看 HTML 源代码(view-source:http://norumors.net/?post_type=rumors?post_type=rumors)之后编写的代码:
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Scrape the rumor listing pages and try to write one "Statement,Link" row per
# <li class="o-listicle__item"> element to NEWS.csv, also collecting the rows
# into a pandas DataFrame.
# The block structure below was reconstructed from the control flow; the
# statements themselves are unchanged.
pagesToGet= 1
upperframe=[]
for page in range(1,pagesToGet+1):
    print('processing page :', page)
    url = 'http://norumors.net/?post_type=rumors/?page='+str(page)
    print(url)
    #an exception might be thrown, so the code should be in a try-except block
    try:
        #use the browser to get the url. This is suspicious command that might blow up.
        # NOTE(review): this rebinds the loop variable `page` to the response object.
        page=requests.get(url) # this might throw an exception if something goes wrong.
    except Exception as e: # this describes what to do if an exception is thrown
        error_type, error_obj, error_info = sys.exc_info() # get the exception information
        print ('ERROR FOR LINK:',url) #print the link that cause the problem
        print (error_type, 'Line:', error_info.tb_lineno) #print error info and line that threw the exception
        continue #ignore this page. Abandon this and go back.
    time.sleep(2)
    soup=BeautifulSoup(page.text,'html.parser')
    frame=[]
    links=soup.find_all('li',attrs={'class':'o-listicle__item'})
    # Number of matched <li> elements; 0 means the loop below never runs.
    print(len(links))
    filename="NEWS.csv"
    # NOTE(review): mode "w" truncates the file on every page iteration.
    f=open(filename,"w", encoding = 'utf-8')
    headers="Statement,Link\n"
    f.write(headers)
    for j in links:
        Statement = j.find("div",attrs={'class':'row d-flex'}).text.strip()
        # Link = "http://norumors.net/"
        # NOTE(review): with the assignment above commented out, `Link` is never
        # initialised, so `+=` raises NameError on the first iteration.
        Link += j.find("div",attrs={'class':'col-lg-4 col-md-4 col-sm-6 col-xs-6'}).find('a')['href'].strip()
        frame.append((Statement,Link))
        # NOTE(review): Date, Source and Label are not defined anywhere in this
        # script, and the header above declares only two columns.
        f.write(Statement.replace(",","^")+","+Link+","+Date.replace(",","^")+","+Source.replace(",","^")+","+Label.replace(",","^")+"\n")
    upperframe.extend(frame)
    f.close()
data=pd.DataFrame(upperframe, columns=['Statement','Link'])
data.head()
但是运行代码之后,我得到的 pandas 数据框和 CSV 文件都是空的。请问这是为什么?有什么建议吗?需要说明的是,我想获取的是标签之间的文本。
如果我没理解错的话,您想获取新闻标题的文本以及这些新闻的 href 链接,并将它们写入 CSV 文件。您的代码的问题在于 `for j in links:` 循环从未执行,因为 `soup.find_all('li',attrs={'class':'o-listicle__item'})` 返回的是一个空列表。您应该注意要搜索的标签名称和类名。下面的代码获取新闻文本及其链接,并使用 `pd.DataFrame` 将它们写入 CSV 文件:
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Scrape the rumor listing pages: for every card in the "row d-flex" grid take
# the thumbnail's alt text (the headline) and the archive link's href, then
# write all collected rows to NEWS.csv in one go.
pagesToGet = 1
filename = "NEWS.csv"

# Accumulate across ALL pages; resetting these (or reopening the file in "w"
# mode) inside the loop would keep only the last page's rows.
texts = []
links = []

for page in range(1, pagesToGet + 1):
    print('processing page :', page)
    # NOTE(review): the query string contains two '?'; kept as in the original,
    # confirm the site's real pagination parameter before fetching more pages.
    url = 'http://norumors.net/?post_type=rumors/?page=' + str(page)
    print(url)

    # Network errors are expected here, so only the request sits in the try.
    try:
        # A timeout keeps an unresponsive server from hanging the script forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print('ERROR FOR LINK:', url)  # the link that caused the problem
        print(type(exc), 'Line:', exc.__traceback__.tb_lineno)
        continue  # skip this page and move on to the next one

    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find("div", attrs={'class': 'row d-flex'})
    if container is None:
        # The page layout differs from what is expected; nothing to extract.
        print('No rumor grid found on:', url)
        continue

    cards = container.find_all("div", attrs={'class': 'col-lg-4 col-md-4 col-sm-6 col-xs-6'})
    for card in cards:
        img = card.find("img", attrs={'class': 'rumor__thumb'})
        anchor = card.find("a", attrs={'class': 'rumor--archive'})
        # Append to both lists or to neither, so headline and link stay paired.
        if img is None or anchor is None:
            continue
        texts.append(img.get('alt', ''))
        links.append(anchor.get('href', ''))

data = pd.DataFrame(list(zip(texts, links)), columns=['Statement', 'Link'])
# newline='' lets the CSV writer control line endings; the context manager
# closes the file even if writing fails.
with open(filename, "w", encoding='utf-8', newline='') as f:
    data.to_csv(f, index=False)