Python 3: how to scrape search results from a website using CSRF?
I am trying to scrape the search results of a website that lists French crowdfunding fintechs: https://www.orias.fr/web/guest/search

Done manually, I select the (IFP) radio button and the site returns 13 pages of results, 10 results per page. Each result has a hyperlink, and I would also like to pull information from those detail pages into the final table.

My main problem seems to come from CSRF: the result address contains a token,

p_auth=8mxk0SsK

so I cannot simply loop over the result pages by changing "p=2" to "p=13" in the link:

https://www.orias.fr/search?p_auth=8mxk0SsK&p_p_id=intermediaryDetailedSearch_WAR_oriasportlet&p_p_lifecycle=1&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_intermediaryDetailedSearch_WAR_oriasportlet_myaction=fullSearch
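(For reference: if the token itself is needed, it can usually be recovered by requesting the search page first and pulling p_auth out of the rendered HTML. A minimal sketch, which assumes the token appears somewhere in the page's portlet URLs:

import re
import requests

# minimal sketch: fetch the search page and look for a p_auth token anywhere
# in the rendered HTML (assumption: the token is embedded in the portlet URLs)
with requests.Session() as s:
    page = s.get('https://www.orias.fr/web/guest/search')
    m = re.search(r'p_auth=([\w-]+)', page.text)
    p_auth = m.group(1) if m else None
    print(p_auth)

The accepted approach below side-steps the token entirely by POSTing the search form.)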
So I tried the following Python code:
import requests
from bs4 import BeautifulSoup

k = 1  # test k from 1 to 13

url = "http://www.orias.fr/search?p_p_id=intermediaryDetailedSearch_WAR_oriasportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_intermediaryDetailedSearch_WAR_oriasportlet_d-16544-p=" + str(k) + "&_intermediaryDetailedSearch_WAR_oriasportlet_implicitModel=true&_intermediaryDetailedSearch_WAR_oriasportlet_spring_render=searchResult"

response = requests.get(url, proxies=proxies)  # 200 meant it went through (proxies is defined elsewhere)
soup = BeautifulSoup(response.text, "html.parser")

table = soup.find('table', attrs={'class': 'table table-condensed table-striped table-bordered'})
table_rows = table.find_all('tr')

l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [cell.text for cell in td]
    l.append(row)
It does not behave like it does in a web browser: it just returns a page as if no search had been submitted. Do you know how to make it work?
I would change the page parameter in the POST request during the loop, and do an initial request to find out the number of pages.
from bs4 import BeautifulSoup as bs
import requests, re, math
import pandas as pd

headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Referer': 'https://www.orias.fr/web/guest/search'
}

params = [['p_p_id', 'intermediaryDetailedSearch_WAR_oriasportlet'],
          ['p_p_lifecycle', '0'],
          ['p_p_state', 'normal'],
          ['p_p_mode', 'view'],
          ['p_p_col_id', 'column-1'],
          ['p_p_col_count', '1'],
          ['_intermediaryDetailedSearch_WAR_oriasportlet_d-16544-p', '1'],  # page number, updated in the loop
          ['_intermediaryDetailedSearch_WAR_oriasportlet_implicitModel', 'true'],
          ['_intermediaryDetailedSearch_WAR_oriasportlet_spring_render', 'searchResult']]

data = {
    'searchString': '',
    'address': '',
    'zipCodeOrCity': '',
    '_coa': 'on',
    '_aga': 'on',
    '_ma': 'on',
    '_mia': 'on',
    '_euIAS': 'on',
    'mandatorDenomination': '',
    'wantsMandator': 'no',
    '_cobsp': 'on',
    '_mobspl': 'on',
    '_mobsp': 'on',
    '_miobsp': 'on',
    '_bankActivities': '1',
    '_euIOBSP': 'on',
    '_cif': 'on',
    '_alpsi': 'on',
    '_cip': 'on',
    'ifp': 'true',   # the (IFP) radio button
    '_ifp': 'on',
    'submit': 'Search'
}

p = re.compile(r'(\d+)\s+intermediaries found')

with requests.Session() as s:
    # initial request: number of results and first page of the table
    r = requests.post('https://www.orias.fr/search', headers=headers, params=params, data=data)
    soup = bs(r.content, 'lxml')
    num_results = int(p.findall(r.text)[0])
    results_per_page = 20
    num_pages = math.ceil(num_results / results_per_page)
    df = pd.read_html(str(soup.select_one('.table')))[0]

    # remaining pages: only the page parameter changes
    for i in range(2, num_pages + 1):
        params[6][1] = str(i)
        r = requests.post('https://www.orias.fr/search', headers=headers, params=params, data=data)
        soup = bs(r.content, 'lxml')
        df_next = pd.read_html(str(soup.select_one('.table')))[0]
        df = pd.concat([df, df_next])

df.drop('Unnamed: 6', axis=1, inplace=True)
df = df.reset_index(drop=True)
Check:
print(len(df['Siren Number'].unique()))
#245
So here is the complete code, which also takes into account "Each result has a hyperlink I would also like to get information from into the final table". For each firm I therefore build a new set of request parameters and then pull out the date it was registered or deleted. There is probably a more elegant way to present the code...
from bs4 import BeautifulSoup as bs
import requests, re, math
import pandas as pd

headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Referer': 'https://www.orias.fr/web/guest/search'
}

params = [['p_p_id', 'intermediaryDetailedSearch_WAR_oriasportlet'],
          ['p_p_lifecycle', '0'],
          ['p_p_state', 'normal'],
          ['p_p_mode', 'view'],
          ['p_p_col_id', 'column-1'],
          ['p_p_col_count', '1'],
          ['_intermediaryDetailedSearch_WAR_oriasportlet_d-16544-p', '1'],  # page number
          ['_intermediaryDetailedSearch_WAR_oriasportlet_implicitModel', 'true'],
          ['_intermediaryDetailedSearch_WAR_oriasportlet_spring_render', 'searchResult']]

data = {
    'searchString': '',
    'address': '',
    'zipCodeOrCity': '',
    '_coa': 'on',
    '_aga': 'on',
    '_ma': 'on',
    '_mia': 'on',
    '_euIAS': 'on',
    'mandatorDenomination': '',
    'wantsMandator': 'no',
    '_cobsp': 'on',
    '_mobspl': 'on',
    '_mobsp': 'on',
    '_miobsp': 'on',
    '_bankActivities': '1',
    '_euIOBSP': 'on',
    '_cif': 'on',
    '_alpsi': 'on',
    '_cip': 'on',
    'ifp': 'true',
    '_ifp': 'on',
    'submit': 'Search'
}

p = re.compile(r'(\d+)\s+intermediaries found')

def webdata(soup):
    # rebuild the results table, keeping the href of the detail link instead of the cell text
    parsed_table = soup.find_all('table')[0]
    dataweb = [[td.a['href'] if td.find('a') else
                ''.join(td.stripped_strings)
                for td in row.find_all('td')]
               for row in parsed_table.find_all('tr')]
    dfweb = pd.DataFrame(dataweb[1:], columns=['SIREN', 'ID', 'website', 'category', 'zipcode', 'city', 'website2'])
    dfweb = dfweb.loc[:, ['ID', 'website']]
    dfweb.ID = dfweb.ID.astype(int)
    return dfweb

with requests.Session() as s:
    # initial request: number of results, first page of the table, and detail links
    r = requests.post('https://www.orias.fr/search', headers=headers, params=params, data=data)
    soup = bs(r.content, 'lxml')
    num_results = int(p.findall(r.text)[0])
    results_per_page = 20
    num_pages = math.ceil(num_results / results_per_page)
    df = pd.read_html(str(soup.select_one('.table')))[0]
    dfweb = webdata(soup)
    df = pd.merge(df, dfweb, on='ID')

    for i in range(2, num_pages + 1):
        params[6][1] = str(i)
        r = requests.post('https://www.orias.fr/search', headers=headers, params=params, data=data)
        soup = bs(r.content, 'lxml')
        df_next = pd.read_html(str(soup.select_one('.table')))[0]
        dfweb = webdata(soup)
        df_next = pd.merge(df_next, dfweb, on='ID')
        df = pd.concat([df, df_next])

df.drop('Unnamed: 6', axis=1, inplace=True)
df = df.reset_index(drop=True)

# list the ORIAS identity number given to firms:
# take the last 7 characters of the link (the last one is a space), then keep the digits only
df['oriasID'] = df.website.apply(lambda x: x[-7:][:6])
df['oriasID'] = df.oriasID.apply(lambda y: ''.join(i for i in y if i.isdigit()))

# parameters for the detail page of one firm
def paramsub(IDi):
    return [['p_p_id', 'intermediaryDetailedSearch_WAR_oriasportlet'],
            ['p_p_lifecycle', '1'],
            ['p_p_state', 'normal'],
            ['p_p_mode', 'view'],
            ['p_p_col_id', 'column-1'],
            ['p_p_col_count', '1'],
            ['_intermediaryDetailedSearch_WAR_oriasportlet_myaction', 'viewDetails'],
            ['_intermediaryDetailedSearch_WAR_oriasportlet_partyId', IDi]]

df['date in'] = False
df['date out'] = False

with requests.Session() as s:
    for i in df.index:
        IDi = df.loc[i, 'oriasID']
        r = requests.post('https://www.orias.fr/search', headers=headers, params=paramsub(IDi), data=data)
        soup = bs(r.content, 'lxml')
        # keep the data inside blocint3 if "IFP" is in the text
        for rowi in soup.find_all('div', {'class': 'blocint3'}):
            if 'IFP' in rowi.text:
                if 'Deleted' in rowi.text:
                    # identify the date
                    df.loc[i, 'date out'] = re.search(r'\d{2}-\d{2}-\d{4}', rowi.text).group()
                elif 'Registered' in rowi.text:
                    df.loc[i, 'date in'] = re.search(r'\d{2}-\d{2}-\d{4}', rowi.text).group()

# dates to datetime format
df['date in'] = pd.to_datetime(df['date in'], format="%d-%m-%Y", errors='coerce')
df['date out'] = pd.to_datetime(df['date out'], format="%d-%m-%Y", errors='coerce')

# sort by dates
df = df.sort_values(by='date out', ascending=True)
df = df.sort_values(by='date in', ascending=True)
df = df.reset_index(drop=True)

# export
df.to_csv('20190817_ORIAS_in_out.csv')
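As a quick sanity check on the export, the CSV can be reloaded and, for example, filtered for IFPs that are registered but not (yet) deleted. A minimal sketch, assuming only the column names written by the code above:

import pandas as pd

# reload the export and count IFPs with a registration date but no deletion date
df = pd.read_csv('20190817_ORIAS_in_out.csv', parse_dates=['date in', 'date out'])
still_registered = df['date in'].notna() & df['date out'].isna()
print(still_registered.sum(), "IFPs registered and not (yet) deleted")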