BeautifulSoup Python error (10060), urgent
I am new to programming and I built a web scraper in Python using Beautiful Soup, but when I run the program it just opens the Python command line with a blinking cursor and nothing happens... Now I am getting these errors:

TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond

ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

Here is my code:
import urllib.request
import urllib
import json
import xml.etree.ElementTree as ET
import csv
from bs4 import BeautifulSoup

link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka"
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict"
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName"

alldata = []
links = {}
certificatedata = []


def getData(url, values):
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8')
    req = urllib.request.Request(url, data)
    response = urllib.request.urlopen(req)
    data = response.read()
    data = data.decode("utf-8")
    return data


def getDivsion():
    ## for now we are taking 6 districts.. it needs to be updated when the data gets updated
    return range(1, 7)


def getDistrict(divId):
    global distlink
    values = {'DivID': divId}
    data = getData(distlink, values)
    return data


def parseJson(data):
    parsed = json.loads(data)
    return parsed


def getTaluka(disId):
    global talukaLink
    values = {'DisID': disId}
    data = getData(talukaLink, values)
    return data


def getProjects(divId, disId):
    global prjLink
    values = {'DisID': disId, 'DivID': divId}
    #print(values)
    data = getData(prjLink, values)
    if len(data) < 10:
        return "{}"
    return data


def getProjectsList():
    divList = getDivsion()
    flag = 0
    for divId in divList:
        disData = getDistrict(divId)
        disList = parseJson(disData)
        for disObj in disList:
            disId = disObj["ID"]
            prjData = getProjects(divId, disId)
            #print(" >>>> "+str(disId)+" >> "+str(divId))
            #print(prjData)
            prjJson = parseJson(prjData)
            for prjObj in prjJson:
                flag += 1
                prjId = prjObj["ID"]
                values = {'ID': 0, 'pageTraverse': 1, 'Division': divId, 'hdnDistrict': '', 'hdnProject': '', 'District': disId, 'Taluka': '', 'Village': '', 'Project': prjId, 'CertiNo': '', 'btnSearch': 'Search'}
                finalPrjData = getData(link, values)
                parseXMLData(finalPrjData)
                #if len(alldata)>100:
                #    break


def parseXMLData(htmldata):
    global alldata, links
    soup = BeautifulSoup(htmldata, "html.parser")
    tables = soup.find_all("table")
    for table in tables:
        print(len(alldata))
        attr = table.attrs
        if "table" in attr['class']:
            tbody = table.find_all("tbody")
            if len(tbody) > 0:
                tbody = tbody[0]
                tr_lst = tbody.find_all("tr")
                for tr in tr_lst:
                    sublist = []
                    td_lst = tr.find_all("td")
                    if len(td_lst) > 6:
                        prjname = td_lst[1].text
                        proname = td_lst[2].text
                        certNo = td_lst[3].text
                        sublist.append(prjname)
                        sublist.append(proname)
                        sublist.append(certNo)
                        td = td_lst[4]
                        a_lst = td.find_all("a")
                        if len(a_lst) > 0:
                            a = a_lst[0]
                            href = a.attrs['href']
                            link = "https://maharerait.mahaonline.gov.in/" + href
                            links[certNo] = link
                            sublist.append(link)
                    if len(sublist) > 0:
                        alldata.append(sublist)
    return alldata


def writedata(alldata1, filename):
    print(" >>>> FINAL PRINTING DATA >>>> ")
    #import pdb; pdb.set_trace()
    with open("./" + filename, 'w') as csvfile:
        csvfile = csv.writer(csvfile, delimiter=',')
        #csvfile.writerow(titleRow)
        csvfile.writerow("")
        for i in range(0, len(alldata1)):
            #print(alldata1[i])
            csvfile.writerow(alldata1[i])


def processlinksforcert():
    global links, certificatedata
    print(">> Came in fetching certificates data >>> ")
    for certno in links.keys():
        link = links[certno]
        htmldata = getData(link, {})
        soup = BeautifulSoup(htmldata, "html.parser")
        divs = soup.find_all("div")
        for div in divs:
            attr = div.attrs
            if "id" in attr.keys() and "DivProfessional" in attr['id']:
                table = div.find_all("table")
                if len(table) <= 0:
                    continue
                t_attr = table[0].attrs
                if "table" in t_attr["class"]:
                    print(len(certificatedata))
                    table = table[0]
                    tr_lst = table.find_all("tr")
                    index = 1
                    while index < len(tr_lst):
                        #import pdb; pdb.set_trace()
                        #for tr in tr_lst:
                        #if index==0:
                        #    continue
                        tr = tr_lst[index]
                        index += 1
                        sublist = []
                        td_lst = tr.find_all("td")
                        if len(td_lst) > 2:
                            sublist.append(certno)
                            pername = formattext(td_lst[0].text)
                            cerno = formattext(td_lst[1].text)
                            proftype = formattext(td_lst[2].text)
                            sublist.append(pername)
                            sublist.append(cerno)
                            sublist.append(proftype)
                            certificatedata.append(sublist)
    return certificatedata


def formattext(text):
    while text.find("\r\n") >= 0:
        text = text.replace("\r\n", "")
    while text.find(" ") >= 0:
        text = text.replace(" ", "")
    return text


def main():
    global alldata, certificatedata
    #data = getData(url, {})
    getProjectsList()
    print("Before write the projects data to the file. Count >> " + str(len(alldata)))
    writedata(alldata, "data.csv")
    data = processlinksforcert()
    print("Before write the certificates data to the file. Count >> " + str(len(data)))
    writedata(data, "certificates.csv")


main()
Can someone tell me what I am doing wrong? I have pip installed, and I installed BeautifulSoup with pip as well...
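For what it is worth, WinError 10060 and 10054 usually just mean the server did not respond in time or dropped the connection, which is common when a site is slow or throttles scripted requests. A minimal sketch of a more defensive getData, still using urllib, could add a timeout, a User-Agent header and a simple retry loop; the timeout value, retry count and header string below are illustrative assumptions, not anything the site requires:

import time
import urllib.error
import urllib.parse
import urllib.request

def getData(url, values, retries=3, timeout=30):
    # Encode the form fields exactly as in the original code.
    data = urllib.parse.urlencode(values).encode('utf-8')
    # Some servers reject urllib's default User-Agent; this value is only a guess.
    req = urllib.request.Request(url, data, {'User-Agent': 'Mozilla/5.0'})
    for attempt in range(retries):
        try:
            response = urllib.request.urlopen(req, timeout=timeout)
            return response.read().decode('utf-8')
        except (TimeoutError, ConnectionResetError, urllib.error.URLError):
            # Give up after the last attempt, otherwise back off briefly and retry.
            if attempt == retries - 1:
                raise
            time.sleep(5)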
I solved it with Selenium. Thanks a lot, everyone.
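In case it helps anyone else, the rough idea with Selenium is to let a real browser fetch and render the page and then hand the HTML to the same BeautifulSoup parsing code. A minimal sketch, assuming the selenium package is installed and Chrome with a matching chromedriver is on PATH (form filling and pagination are left out):

from bs4 import BeautifulSoup
from selenium import webdriver

# Start a real browser session; assumes chromedriver is available on PATH.
driver = webdriver.Chrome()
driver.get('https://maharerait.mahaonline.gov.in/searchlist/searchlist')

# Hand the rendered page to BeautifulSoup, just as parseXMLData does with the urllib response.
soup = BeautifulSoup(driver.page_source, 'html.parser')
tables = soup.find_all('table')

driver.quit()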