For loop - running one loop to completion, then running the next loop (Python)
This script needs to run through RI_page_urls.csv, then run through all of the resulting urls in RI_License_urls.csv and grab the business info. It pulls all of the urls from RI_page_urls.csv, but only runs and prints the first of the 100 urls from RI_License_urls.csv. I need help figuring out how to make it wait for the first part to complete before running the second part.
Thanks for any help.
Here's the url that RI_page_urls.csv starts with:
http://www.crb.state.ri.us/verify_CRB.php
And the code:
from bs4 import BeautifulSoup as soup
import requests as r
import pandas as pd
import re
import csv

#pulls lic# url
with open('RI_page_urls.csv') as f_input:
    csv_input = csv.reader(f_input)
    for url in csv_input:
        data = r.get(url[0])
        page_data = soup(data.text, 'html.parser')
        links = [r'www.crb.state.ri.us/' + link['href']
                 for link in page_data.table.tr.find_all('a')
                 if re.search('licensedetail.php', str(link))]
        df = pd.DataFrame(links)
        df.to_csv('RI_License_urls.csv', header=False, index=False, mode='a')

#Code Above works!

#need to pull table info from license url
#this pulls the first record, but doesn't loop through the requests
with open('RI_License_urls.csv') as f_input_2:
    csv_input_2 = csv.reader(f_input_2)
    for url in csv_input_2:
        data = r.get(url[0])
        page_data = soup(data.text, 'html.parser')
        company_info = (' '.join(info.get_text(", ", strip=True).split())
                        for info in page_data.find_all('h9'))
        df = pd.DataFrame(info, columns=['company_info'])
        df.to_csv('RI_company_info.csv', index=False)
Well, the question is a bit unclear, and the code has a couple of bugs.

data = r.get(url[0])

should be

data = r.get("http://"+url[0])

because requests needs the url to start with http or https, while the urls written to RI_License_urls.csv start with www.
And in the code below, info is not defined; I just assume it should be company_info:

company_info = (' '.join(info.get_text(", ", strip=True).split()) for info in page_data.find_all('h9'))
df = pd.DataFrame(info, columns=['company_info'])
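As a quick illustration of what that generator expression does (the snippet below is made up, not taken from the real page): get_text(", ", strip=True) joins a tag's text pieces with commas, and the split/join collapses stray whitespace:

from bs4 import BeautifulSoup as soup

html = "<h9>Acme Builders<br/>  123 Main St  <br/>Providence, RI</h9>"
tag = soup(html, 'html.parser').find('h9')
print(' '.join(tag.get_text(", ", strip=True).split()))
# Acme Builders, 123 Main St, Providence, RI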
So the full code is:
from bs4 import BeautifulSoup as soup
import requests as r
import pandas as pd
import re
import csv

#pulls lic# url
with open('RI_page_urls.csv') as f_input:
    csv_input = csv.reader(f_input)
    for url in csv_input:
        data = r.get(url[0])
        page_data = soup(data.text, 'html.parser')
        # keep only the links that point at licensedetail.php
        links = [r'www.crb.state.ri.us/' + link['href']
                 for link in page_data.table.tr.find_all('a')
                 if re.search('licensedetail.php', str(link))]
        df = pd.DataFrame(links)
        df.to_csv('RI_License_urls.csv', header=False, index=False, mode='a')

#Code Above works!

#need to pull table info from license url
with open('RI_License_urls.csv') as f_input_2:
    csv_input_2 = csv.reader(f_input_2)
    # line buffering (buffering=1) flushes each row as it is written;
    # buffering=0 is only valid for binary-mode files in Python 3
    with open('RI_company_info.csv', 'a', buffering=1) as companyinfofiledescriptor:
        for url in csv_input_2:
            # the stored urls start with www, so prepend the scheme
            data = r.get("http://" + url[0])
            page_data = soup(data.text, 'html.parser')
            company_info = (' '.join(info.get_text(", ", strip=True).split())
                            for info in page_data.find_all('h9'))
            df = pd.DataFrame(company_info, columns=['company_info'])
            df.to_csv(companyinfofiledescriptor, index=False)
            print(df)
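Note that Python already runs the two with blocks strictly one after the other, so the second part cannot start before the first finishes; the first-record-only behaviour came from the bugs above, not from the loops overlapping. If you want that ordering to be explicit anyway, here is a minimal sketch of the same script split into two functions (the names collect_license_urls and scrape_company_info are mine, not from the original code):

from bs4 import BeautifulSoup as soup
import requests as r
import pandas as pd
import re
import csv

def collect_license_urls():
    # part 1: write every license-detail url to RI_License_urls.csv
    with open('RI_page_urls.csv') as f_input:
        for url in csv.reader(f_input):
            page_data = soup(r.get(url[0]).text, 'html.parser')
            links = ['www.crb.state.ri.us/' + link['href']
                     for link in page_data.table.tr.find_all('a')
                     if re.search('licensedetail.php', str(link))]
            pd.DataFrame(links).to_csv('RI_License_urls.csv',
                                       header=False, index=False, mode='a')

def scrape_company_info():
    # part 2: only reached after collect_license_urls() has returned
    with open('RI_License_urls.csv') as f_input:
        for url in csv.reader(f_input):
            page_data = soup(r.get('http://' + url[0]).text, 'html.parser')
            company_info = [' '.join(info.get_text(", ", strip=True).split())
                            for info in page_data.find_all('h9')]
            pd.DataFrame(company_info, columns=['company_info']).to_csv(
                'RI_company_info.csv', header=False, index=False, mode='a')

collect_license_urls()   # runs to completion first
scrape_company_info()    # then, and only then, this runs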