python 中的循环问题 - 相同数据的多个条目
Problem with loop in python - Multiple entries of same data
我正在用 python 向一个 .csv 文件追加数据。数据是从网上抓取的。与抓取有关的部分我几乎都完成了。
当我试图向文件追加内容时,问题出现了:程序会写入上百条(超过 100 条)相同的数据条目。所以我确定循环(for)或 if 语句存在我无法识别和解决的问题。
该条件检查从网络抓取的数据与文件中已有数据的相似性。
如果数据不匹配,则程序写入新行,否则中断或继续。
注意:csvFileArray 是一个数组,用于检查现有 file.txt 中的数据。例如 print(csvFileArray[0])
给出:
{'Date': '19/05/21', 'Time': '14:51:00', 'Status': 'Waitlisted', 'School': 'MIT Sloan', 'Details': 'GPA: 3.4 Round: Round 2 | Texas'}
下面是有问题的代码。
# Scrape pages 15-16 and append the entries to file.csv.
# Open in append mode so rows already in the file are kept.
file = open('file.csv', 'a')
writer = csv.writer(file)
#loop for page numbers
for page in range(15, 17):
    print("Getting page {}..".format(page))
    params["paged"] = page
    # The response is JSON; the HTML fragment to parse is under "markup".
    data = requests.post(url, data=params).json()
    soup = BeautifulSoup(data["markup"], "html.parser")
    for entry in soup.select(".livewire-entry"):
        datime = entry.select_one(".adate")
        status = entry.select_one(".status")
        name = status.find_next("strong")
        details = entry.select_one(".lw-details")
        datime = datime.get_text(strip=True)
        datime = datetime.datetime.strptime(datime, '%B %d, %Y %I:%M%p')
        time = datime.time() #returns time
        date = datime.date() #returns date
        # BUG (the subject of the question, intentionally kept as posted):
        # the row is written once for EVERY stored entry that does not match,
        # instead of once when NO stored entry matches.
        # NOTE(review): `date`/`time` are datetime objects while the sample
        # csvFileArray row holds strings, so this condition presumably never
        # matches -- confirm against how csvFileArray is loaded.
        for firstentry in csvFileArray:
            condition = (
                ((firstentry['Date']) == date)
                and ((firstentry['Time']) == time)
                and ((firstentry['Status']) == (status.get_text(strip=True)))
                and ((firstentry['School']) == (name.get_text(strip=True)))
                and ((firstentry['Details']) == details.get_text(strip=True))
            )
            if condition:
                continue
            else:
                writer.writerow([date, time, status.get_text(strip=True), name.get_text(strip=True),details.get_text(strip=True)])
                #print('ok')
        # NOTE(review): the original indentation was lost; a per-entry
        # separator is assumed here -- confirm.
        print("-" * 80)
file.close()
我猜您是想只在 csvFileArray 中没有任何条目满足该条件(即找不到匹配的已有记录)时才写入该行。
而现在,每遇到一个不匹配的 csvFileArray 条目,您都会写入一次。
# Write each scraped entry at most once: only when no row already stored in
# csvFileArray is identical to it.
for entry in soup.select(".livewire-entry"):
    datime = entry.select_one(".adate")
    status = entry.select_one(".status")
    name = status.find_next("strong")
    details = entry.select_one(".lw-details")
    datime = datime.get_text(strip=True)
    datime = datetime.datetime.strptime(datime, '%B %d, %Y %I:%M%p')
    time = datime.time() #returns time
    date = datime.date() #returns date

    # csvFileArray holds plain strings, while `date`/`time` are datetime
    # objects that never compare equal to a str. Build the row as strings
    # once and use it both for the comparison and for writing.
    # NOTE(review): the DD/MM/YY and HH:MM:SS formats are taken from the
    # sample row ('19/05/21', '14:51:00') -- confirm they match the file.
    row = [
        date.strftime('%d/%m/%y'),
        time.strftime('%H:%M:%S'),
        status.get_text(strip=True),
        name.get_text(strip=True),
        details.get_text(strip=True),
    ]

    should_write = True
    for firstentry in csvFileArray:
        stored = [
            firstentry['Date'],
            firstentry['Time'],
            firstentry['Status'],
            firstentry['School'],
            firstentry['Details'],
        ]
        if stored == row:
            # Already in the file: no need to look at the remaining rows.
            should_write = False
            break
    if should_write:
        writer.writerow(row)
        #print('ok')
您也可以改用 any() 搭配生成器表达式(写法类似列表推导式)来完成同样的判断,不过条件很长时,这种写法的可读性会变差:
# Same check as the explicit flag/break loop, expressed with any().
# csvFileArray holds plain strings, while `date`/`time` are datetime objects
# that never compare equal to a str, so build the row as strings once and use
# it both for the comparison and for writing.
# NOTE(review): the DD/MM/YY and HH:MM:SS formats are taken from the sample
# row ('19/05/21', '14:51:00') -- confirm they match the file.
row = [
    date.strftime('%d/%m/%y'),
    time.strftime('%H:%M:%S'),
    status.get_text(strip=True),
    name.get_text(strip=True),
    details.get_text(strip=True),
]
if not any(
    [
        firstentry['Date'],
        firstentry['Time'],
        firstentry['Status'],
        firstentry['School'],
        firstentry['Details'],
    ] == row
    for firstentry in csvFileArray
):
    writer.writerow(row)
    #print('ok')
我正在用 python 向一个 .csv 文件追加数据。数据是从网上抓取的。与抓取有关的部分我几乎都完成了。
当我试图向文件追加内容时,问题出现了:程序会写入上百条(超过 100 条)相同的数据条目。所以我确定循环(for)或 if 语句存在我无法识别和解决的问题。
该条件检查从网络抓取的数据与文件中已有数据的相似性。 如果数据不匹配,则程序写入新行,否则中断或继续。
注意:csvFileArray 是一个数组,用于检查现有 file.txt 中的数据。例如 print(csvFileArray[0])
给出:
{'Date': '19/05/21', 'Time': '14:51:00', 'Status': 'Waitlisted', 'School': 'MIT Sloan', 'Details': 'GPA: 3.4 Round: Round 2 | Texas'}
下面是有问题的代码。
# Scrape pages 15-16 and append the entries to file.csv.
# Open in append mode so rows already in the file are kept.
file = open('file.csv', 'a')
writer = csv.writer(file)
#loop for page numbers
for page in range(15, 17):
    print("Getting page {}..".format(page))
    params["paged"] = page
    # The response is JSON; the HTML fragment to parse is under "markup".
    data = requests.post(url, data=params).json()
    soup = BeautifulSoup(data["markup"], "html.parser")
    for entry in soup.select(".livewire-entry"):
        datime = entry.select_one(".adate")
        status = entry.select_one(".status")
        name = status.find_next("strong")
        details = entry.select_one(".lw-details")
        datime = datime.get_text(strip=True)
        datime = datetime.datetime.strptime(datime, '%B %d, %Y %I:%M%p')
        time = datime.time() #returns time
        date = datime.date() #returns date
        # BUG (the subject of the question, intentionally kept as posted):
        # the row is written once for EVERY stored entry that does not match,
        # instead of once when NO stored entry matches.
        # NOTE(review): `date`/`time` are datetime objects while the sample
        # csvFileArray row holds strings, so this condition presumably never
        # matches -- confirm against how csvFileArray is loaded.
        for firstentry in csvFileArray:
            condition = (
                ((firstentry['Date']) == date)
                and ((firstentry['Time']) == time)
                and ((firstentry['Status']) == (status.get_text(strip=True)))
                and ((firstentry['School']) == (name.get_text(strip=True)))
                and ((firstentry['Details']) == details.get_text(strip=True))
            )
            if condition:
                continue
            else:
                writer.writerow([date, time, status.get_text(strip=True), name.get_text(strip=True),details.get_text(strip=True)])
                #print('ok')
        # NOTE(review): the original indentation was lost; a per-entry
        # separator is assumed here -- confirm.
        print("-" * 80)
file.close()
我猜您是想只在 csvFileArray 中没有任何条目满足该条件(即找不到匹配的已有记录)时才写入该行。
而现在,每遇到一个不匹配的 csvFileArray 条目,您都会写入一次。
# Write each scraped entry at most once: only when no row already stored in
# csvFileArray is identical to it.
for entry in soup.select(".livewire-entry"):
    datime = entry.select_one(".adate")
    status = entry.select_one(".status")
    name = status.find_next("strong")
    details = entry.select_one(".lw-details")
    datime = datime.get_text(strip=True)
    datime = datetime.datetime.strptime(datime, '%B %d, %Y %I:%M%p')
    time = datime.time() #returns time
    date = datime.date() #returns date

    # csvFileArray holds plain strings, while `date`/`time` are datetime
    # objects that never compare equal to a str. Build the row as strings
    # once and use it both for the comparison and for writing.
    # NOTE(review): the DD/MM/YY and HH:MM:SS formats are taken from the
    # sample row ('19/05/21', '14:51:00') -- confirm they match the file.
    row = [
        date.strftime('%d/%m/%y'),
        time.strftime('%H:%M:%S'),
        status.get_text(strip=True),
        name.get_text(strip=True),
        details.get_text(strip=True),
    ]

    should_write = True
    for firstentry in csvFileArray:
        stored = [
            firstentry['Date'],
            firstentry['Time'],
            firstentry['Status'],
            firstentry['School'],
            firstentry['Details'],
        ]
        if stored == row:
            # Already in the file: no need to look at the remaining rows.
            should_write = False
            break
    if should_write:
        writer.writerow(row)
        #print('ok')
您也可以改用 any() 搭配生成器表达式(写法类似列表推导式)来完成同样的判断,不过条件很长时,这种写法的可读性会变差:
# Same check as the explicit flag/break loop, expressed with any().
# csvFileArray holds plain strings, while `date`/`time` are datetime objects
# that never compare equal to a str, so build the row as strings once and use
# it both for the comparison and for writing.
# NOTE(review): the DD/MM/YY and HH:MM:SS formats are taken from the sample
# row ('19/05/21', '14:51:00') -- confirm they match the file.
row = [
    date.strftime('%d/%m/%y'),
    time.strftime('%H:%M:%S'),
    status.get_text(strip=True),
    name.get_text(strip=True),
    details.get_text(strip=True),
]
if not any(
    [
        firstentry['Date'],
        firstentry['Time'],
        firstentry['Status'],
        firstentry['School'],
        firstentry['Details'],
    ] == row
    for firstentry in csvFileArray
):
    writer.writerow(row)
    #print('ok')