我有来自 html 'copy element' 的一行,我正在尝试制作一个正则表达式以去除不相关的内容并保留列表
I have a line from an HTML 'copy element' and I am trying to write a regex that strips out the irrelevant parts and keeps the list.
html_str:
'<canvas id="ratings_histogram" data-json="[10,4,4,5,8,8,9,10,10,7,9,6,6,7,3,7,7,2,4,3,10,4,8,4,4,4,8,10,4,7,10,7,5,8,1,10,6,4,9,7,1,1,7,10,3,9,5,1,4,7,2,1,6,5,2,5,7,10,10,5,5,4,1,7,10,6,7,4,4,4,9,1,7,10,1,6,10,1,3,2,8,10,3,10,9,2,6,1,4,1,8,3,5,10,5,5,4,3,7,3,4,1,1,6,10,3,1,2,6,7,4,7,6,1,7,3,5,4,6,4,7,9,10,3,7,5,8,5,6,6,1,5,4,10,3,1,2,1,6,10,8,8,6,6,10,7,7,7,7,7,4,10,1,6,6,4,4,7,7,5,1,4,10,4,1,10,2,3,2,7,10,4,1,8,8,6,7,2,10,8,8,3,1,7,10,10,8,5,1,1,5,10,6,6,4,10,1,8,4,8,3,10,10,3,10,5,1,10,4,7,9,6,3,6,6,6,9,9,8,7,1,6,6,5,4,3,7,10,1,4,5,10,7,1,10,1,8,4,9,10,5,1,7,9,4,8,7,10,9,10,9,1,7,6,1,7,1,1,7,5,9,2,10,3,3,4,1,10,9,2,9,10,6,1,3,5,1,6,1,7,5,8,10,4,10,7,3,10,3,5,10,10,6,10,1,4,9,7,7,6,6,1,4,4,7,5,5,10,1,10,7,4,5,7,7,10,1,3,10,6,8,5,10,10,9,5,5,7,2,9,1,8,10,10,5,6,3,4,1,5,7,4,1,1,1,8,8,8,8,10,6,10,10,6,9,4,8,4,6,7,10,9,8,10,7,7,1,1,5,7,6,8,7,10,6,7,7,6,10,9,10,7,8,8,7,1,10,9,5,7,7,9,8,4,10,4,7,1,7,10,1,7,9,7,5,10,8,8,10,8,10,8,8,10,1,10,7,5,3,1,10,7,1,9,10,7,5,8,7,6,5,10,5,5,10,6,6,4,9,7,8,1,2,2,7,4,1,8,7,7,1,10,3,1,6,10,1,8,7,8,6,5,6,8,6,3,10,5,6,2,7,8,9,10,10,2,1,5,10,1,10,4,1,7,7,10,8,1,4,3,3,9,9,10,9,10,6,4,5,10,10,8,3,8,8,10,7,5,7,9,1,5,10,2,7,5,9,10,1,4,7,8,2,10,10,8,4,10,10,8,5,4,4,10,1,8,6,6,5,7,6,7,8,10,10,6,4,7,5,4,9,6,10,10,1,6,10,1,1,8,10,4,7,4,7,7,6,7,7,9,7,4,10,4,7,6,8,4,5,9,6,8,8,6,5,8,5,6,10,9,3,7,10,9,8,7,7,6,7,5,5,8,8,10,7,8,1,2,7,7,10,10,1,6,7,10,9,8,9,1,1,10,8,2,9,1,4,4,5,6,2,1,4,10,6,1,3,2,10,2,8,7,10,8,5,6,10,7,7,1,6,7,8,6,8,6,1,1,5,8,8,1,4,5,7,10,5,6,6,7,7,6,1,7,7,5,7,7,4,4,8,6,2,9,6,7,10,2,3,8,10,6,2,9,3,4,10,2,7,8,7,6,7,4,7,7,9,5,7,7,6,7,7,7,10,9,10,4,10,8,10,5,6,4,8,7,7,5,6,8,10,8,6,9,7,9,8,7,5,8,2,6,10,10,10,8,2,7,3,8,1,6,5,7,10,7,5,5,8,2,9,6,7,9,6,8,6,7,6,5,8,6,10,1,5,7,7,8,7,7,7,10,10,7,10,3,8,1,8,10,5,7,8,7,8,8,4,1,10,7,7,10,9,5,7,10,10,10,9,10,1,10,8,4,4,10,9,5,6,9,7,7,10,5,10,10,7,6,6,7,6,4,7,8,7,7,3,7,10,2,6,8,10,7,8,1,10,7,9,7,10,6,6,5,9,7,7,3,7,10,5,7,9,10,5,5,7,8,8,8,8,1,9,8,10,6,7,8,4,8,7,6,8,8,8,7,7,10,8,6,10,10,9,3,7,5,8,9,7,8,7,
3,7,9,7,6,7,10,6,10,8,1,7,8,7,8,9,8,8,1,10,9,10,7,8,3,3,10,9,9,6,1,4,8,6,4,8,5,9,10,8,10,8,6,6,6,6,9,9,10,8,8,8,7,3,7,6,6,1,10,7,6,4,10,4,3,3,5,6,9,10,7,10,5,6,1,10,10,6,1,9,5,4,7,6,4,7,10,5,6,10,1,7,7,8,4,6,10,9,7,9,9,4,10,7,4,6,9,5,4,10,9,2,7,5,7,5,7,5,8,7,8,10,4,5,8,1,6,8,5,10,5,7,6,9,10,5,8,10,9,10,8,1,7,8,7,6,10,8,10,4,8,6,10,7,6,10,6,9,6,1,7,1,8,10,10,9,1,10,10,6,4,9,10,4,8,10,8,10,8,3,5,4,1,1,6,7,8,7,2,10,8,8,9,8,6,7,5,1,8,3,10,10,10,3,10,8,6,9,4,8,5,8,6,9,10,7,6,10,7,10,10,5,10,5,6,7,10,7,6,9,6,6,10,5,8,8,4,7,7,7,6,9,8,2,6,3,7,4,4,6,8,8,10,7,9,9,5,10,8,8,8,9,8,9,10,10,1,9,2,10,5,2,7,8,7,7,7,9,8,8,8,7,8,10,4,5,7,6,8,7,7,4,8,8,10,1,10,8,8,6,10,7,2,2,1,10,10,7,7,7,7,3,10,10,7,7,10,6,1,8,10,6,10,10,8,8,4,10,9,1,9,9,6,7,7,7,6,1,1,10,7,2,10,9,5,8,3,8,7,5,8,10,10,6,4,10,9,8,4,5,10,10,8,8,8,8,7,9,1,2,6,5,3,10,10,9,7,7,10,6,5,6,6,6,5,7,7,10,7,7,1,10,6,10,9,10,3,8,1,7,5,7,7,10,10,1,4,10,8,1,9,10,7,8,6,7,10,6,6,4,1,7,6,8,9,10,7,8,8,7,9,10,10,10,8,9,6,4,9,7,10,7,3,8,8,7,5,7,5,10,7,9,7,9,7,10,8,6,10,8,10,4,1,1,7,10,1,5,7,8,7,7,7,5,8,7,7,8,6,10,7,5,6,3,5,8,2,8,7,8,7,7,9,7,10,3,7,10,7,9,5,10,8,7,10,7,8,10,8,4,7,9,7,10,10,9,10,3,9,9,7,4,6,7,5,10,1,5,7,10,9,7,10,3,7,7,3,9,7,7,10,6,8,7,8,6,7,8,1,3,1,10,10,8,7,3,8,6,9,4,8,8,8,10,9,10,10,7,8,10,8,10,10,9,10,4,2,7,7,10,7,8,7,7,9,8,9,9,9,8,9,7,4,10,10,7,10,7,8,8,10,5,7,7,10,10,8,7,7,7,7,9,8,7,5,7,4,5,7,7,8,6,1,9,7,8,7,9,10,9,1,10,6,7,10,7,9,10,9,8,7,10,8,7,10,9,8,7,9,9,10,8,7,8,5,10,8,5,9,5]" height="191" width="384" class="chartjs-render-monitor" style="display: block; height: 213px; width: 427px;"></canvas>'
我知道这很乱,但不知道如何编辑 html 以避免损坏东西。
我想做的是一个 'small'(小巧)的东西,它只处理 json——方括号之间的数据内容就是一个列表。
到目前为止我有:
# re.match only tries to match at the very start of html_str ('<canvas ...');
# after the first character there is no digit, so this call returns None.
val_list = re.match(r'[^[](\d+\,)+\w', html_str)
该正则的意图是:不包括左方括号;一个或多个数字后跟一个逗号,这一整组可重复任意多次,最后再跟一个数字。
结果是 val_list 中存入了 None(NoneType)。
目标是随后将其转换为列表,但由于没有任何东西('nothing')可以转换为列表,我就卡住了。
在下面的答案给出之后,完成的代码如下:
def json_extract(html_str, prior_total=45, prior_count=10):
    """Print the smoothed and plain averages of an element's ratings.

    The markup in *html_str* is parsed as XML and the ``data-json``
    attribute of its root element is decoded as a JSON list of numbers.

    Args:
        html_str: Markup of a single element (for example a copied
            ``<canvas>`` tag) that carries a ``data-json`` attribute.
        prior_total: Sum of the pseudo-ratings added before averaging.
        prior_count: Number of pseudo-ratings added before averaging.

    Returns:
        None. Both averages are written to standard output.

    Raises:
        ValueError: If the root element has no ``data-json`` attribute.
        xml.etree.ElementTree.ParseError: If *html_str* is not
            well-formed XML.
    """
    import xml.etree.ElementTree as ET
    import json
    import numpy

    canvas = ET.fromstring(html_str)
    raw = canvas.attrib.get('data-json')
    if raw is None:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("element has no 'data-json' attribute")
    val_list = json.loads(raw)

    # NOTE(review): the defaults add 45 spread over 10 pseudo-ratings; one
    # vote for each rating 1..10 would total 55 -- confirm the intended prior.
    bayes_average = (sum(val_list) + prior_total) / (len(val_list) + prior_count)
    average = numpy.mean(val_list)
    print(f'bayes average: {bayes_average} \naverage {average}')
#new cell
html_str = '<canvas id="ratings_histogram" data-json="[10,4,4,5,8,8,9,10,10,7,9,6,6,7,3,7,7,2,4,3,10,4,8,4,4,4,8,10,4,7,10,7,5,8,1,10,6,4,9,7,1,1,7,10,3,9,5,1,4,7,2,1,6,5,2,5,7,10,10,5,5,4,1,7,10,6,7,4,4,4,9,1,7,10,1,6,10,1,3,2,8,10,3,10,9,2,6,1,4,1,8,3,5,10,5,5,4,3,7,3,4,1,1,6,10,3,1,2,6,7,4,7,6,1,7,3,5,4,6,4,7,9,10,3,7,5,8,5,6,6,1,5,4,10,3,1,2,1,6,10,8,8,6,6,10,7,7,7,7,7,4,10,1,6,6,4,4,7,7,5,1,4,10,4,1,10,2,3,2,7,10,4,1,8,8,6,7,2,10,8,8,3,1,7,10,10,8,5,1,1,5,10,6,6,4,10,1,8,4,8,3,10,10,3,10,5,1,10,4,7,9,6,3,6,6,6,9,9,8,7,1,6,6,5,4,3,7,10,1,4,5,10,7,1,10,1,8,4,9,10,5,1,7,9,4,8,7,10,9,10,9,1,7,6,1,7,1,1,7,5,9,2,10,3,3,4,1,10,9,2,9,10,6,1,3,5,1,6,1,7,5,8,10,4,10,7,3,10,3,5,10,10,6,10,1,4,9,7,7,6,6,1,4,4,7,5,5,10,1,10,7,4,5,7,7,10,1,3,10,6,8,5,10,10,9,5,5,7,2,9,1,8,10,10,5,6,3,4,1,5,7,4,1,1,1,8,8,8,8,10,6,10,10,6,9,4,8,4,6,7,10,9,8,10,7,7,1,1,5,7,6,8,7,10,6,7,7,6,10,9,10,7,8,8,7,1,10,9,5,7,7,9,8,4,10,4,7,1,7,10,1,7,9,7,5,10,8,8,10,8,10,8,8,10,1,10,7,5,3,1,10,7,1,9,10,7,5,8,7,6,5,10,5,5,10,6,6,4,9,7,8,1,2,2,7,4,1,8,7,7,1,10,3,1,6,10,1,8,7,8,6,5,6,8,6,3,10,5,6,2,7,8,9,10,10,2,1,5,10,1,10,4,1,7,7,10,8,1,4,3,3,9,9,10,9,10,6,4,5,10,10,8,3,8,8,10,7,5,7,9,1,5,10,2,7,5,9,10,1,4,7,8,2,10,10,8,4,10,10,8,5,4,4,10,1,8,6,6,5,7,6,7,8,10,10,6,4,7,5,4,9,6,10,10,1,6,10,1,1,8,10,4,7,4,7,7,6,7,7,9,7,4,10,4,7,6,8,4,5,9,6,8,8,6,5,8,5,6,10,9,3,7,10,9,8,7,7,6,7,5,5,8,8,10,7,8,1,2,7,7,10,10,1,6,7,10,9,8,9,1,1,10,8,2,9,1,4,4,5,6,2,1,4,10,6,1,3,2,10,2,8,7,10,8,5,6,10,7,7,1,6,7,8,6,8,6,1,1,5,8,8,1,4,5,7,10,5,6,6,7,7,6,1,7,7,5,7,7,4,4,8,6,2,9,6,7,10,2,3,8,10,6,2,9,3,4,10,2,7,8,7,6,7,4,7,7,9,5,7,7,6,7,7,7,10,9,10,4,10,8,10,5,6,4,8,7,7,5,6,8,10,8,6,9,7,9,8,7,5,8,2,6,10,10,10,8,2,7,3,8,1,6,5,7,10,7,5,5,8,2,9,6,7,9,6,8,6,7,6,5,8,6,10,1,5,7,7,8,7,7,7,10,10,7,10,3,8,1,8,10,5,7,8,7,8,8,4,1,10,7,7,10,9,5,7,10,10,10,9,10,1,10,8,4,4,10,9,5,6,9,7,7,10,5,10,10,7,6,6,7,6,4,7,8,7,7,3,7,10,2,6,8,10,7,8,1,10,7,9,7,10,6,6,5,9,7,7,3,7,10,5,7,9,10,5,5,7,8,8,8,8,1,9,8,10,6,7,8,4,8,7,6,8,8,8,7,7,10,8,6,10,10,9,3,7,5
,8,9,7,8,7,3,7,9,7,6,7,10,6,10,8,1,7,8,7,8,9,8,8,1,10,9,10,7,8,3,3,10,9,9,6,1,4,8,6,4,8,5,9,10,8,10,8,6,6,6,6,9,9,10,8,8,8,7,3,7,6,6,1,10,7,6,4,10,4,3,3,5,6,9,10,7,10,5,6,1,10,10,6,1,9,5,4,7,6,4,7,10,5,6,10,1,7,7,8,4,6,10,9,7,9,9,4,10,7,4,6,9,5,4,10,9,2,7,5,7,5,7,5,8,7,8,10,4,5,8,1,6,8,5,10,5,7,6,9,10,5,8,10,9,10,8,1,7,8,7,6,10,8,10,4,8,6,10,7,6,10,6,9,6,1,7,1,8,10,10,9,1,10,10,6,4,9,10,4,8,10,8,10,8,3,5,4,1,1,6,7,8,7,2,10,8,8,9,8,6,7,5,1,8,3,10,10,10,3,10,8,6,9,4,8,5,8,6,9,10,7,6,10,7,10,10,5,10,5,6,7,10,7,6,9,6,6,10,5,8,8,4,7,7,7,6,9,8,2,6,3,7,4,4,6,8,8,10,7,9,9,5,10,8,8,8,9,8,9,10,10,1,9,2,10,5,2,7,8,7,7,7,9,8,8,8,7,8,10,4,5,7,6,8,7,7,4,8,8,10,1,10,8,8,6,10,7,2,2,1,10,10,7,7,7,7,3,10,10,7,7,10,6,1,8,10,6,10,10,8,8,4,10,9,1,9,9,6,7,7,7,6,1,1,10,7,2,10,9,5,8,3,8,7,5,8,10,10,6,4,10,9,8,4,5,10,10,8,8,8,8,7,9,1,2,6,5,3,10,10,9,7,7,10,6,5,6,6,6,5,7,7,10,7,7,1,10,6,10,9,10,3,8,1,7,5,7,7,10,10,1,4,10,8,1,9,10,7,8,6,7,10,6,6,4,1,7,6,8,9,10,7,8,8,7,9,10,10,10,8,9,6,4,9,7,10,7,3,8,8,7,5,7,5,10,7,9,7,9,7,10,8,6,10,8,10,4,1,1,7,10,1,5,7,8,7,7,7,5,8,7,7,8,6,10,7,5,6,3,5,8,2,8,7,8,7,7,9,7,10,3,7,10,7,9,5,10,8,7,10,7,8,10,8,4,7,9,7,10,10,9,10,3,9,9,7,4,6,7,5,10,1,5,7,10,9,7,10,3,7,7,3,9,7,7,10,6,8,7,8,6,7,8,1,3,1,10,10,8,7,3,8,6,9,4,8,8,8,10,9,10,10,7,8,10,8,10,10,9,10,4,2,7,7,10,7,8,7,7,9,8,9,9,9,8,9,7,4,10,10,7,10,7,8,8,10,5,7,7,10,10,8,7,7,7,7,9,8,7,5,7,4,5,7,7,8,6,1,9,7,8,7,9,10,9,1,10,6,7,10,7,9,10,9,8,7,10,8,7,10,9,8,7,9,9,10,8,7,8,5,10,8,5,9,5]" height="191" width="384" class="chartjs-render-monitor" style="display: block; height: 213px; width: 427px;"></canvas>'
# Prints the smoothed and plain averages for the element pasted above;
# the call itself evaluates to None.
json_extract(html_str)
其中,当您从某个网站复制元素时,您可以找到实际平均值和贝叶斯平滑评分(每个类别加 1)。
在该网站的几个系列上测试后,平均值在某些系列上很接近,在另一些系列上则相差较大。
注意:该代码适用于任何数字列表。
我建议使用 XML 解析器来获取您需要的数据,然后将取得的属性值作为 JSON 字符串进行解析。
import xml.etree.ElementTree as ET
import json
# Parse the copied element as XML so its attributes can be read directly.
canvas = ET.fromstring(html_str)
if 'data-json' in canvas.attrib:
    # In the sample element the attribute holds a JSON array, which
    # json.loads turns into a Python list.
    data = json.loads(canvas.attrib['data-json'])
html_str:
'<canvas id="ratings_histogram" data-json="[10,4,4,5,8,8,9,10,10,7,9,6,6,7,3,7,7,2,4,3,10,4,8,4,4,4,8,10,4,7,10,7,5,8,1,10,6,4,9,7,1,1,7,10,3,9,5,1,4,7,2,1,6,5,2,5,7,10,10,5,5,4,1,7,10,6,7,4,4,4,9,1,7,10,1,6,10,1,3,2,8,10,3,10,9,2,6,1,4,1,8,3,5,10,5,5,4,3,7,3,4,1,1,6,10,3,1,2,6,7,4,7,6,1,7,3,5,4,6,4,7,9,10,3,7,5,8,5,6,6,1,5,4,10,3,1,2,1,6,10,8,8,6,6,10,7,7,7,7,7,4,10,1,6,6,4,4,7,7,5,1,4,10,4,1,10,2,3,2,7,10,4,1,8,8,6,7,2,10,8,8,3,1,7,10,10,8,5,1,1,5,10,6,6,4,10,1,8,4,8,3,10,10,3,10,5,1,10,4,7,9,6,3,6,6,6,9,9,8,7,1,6,6,5,4,3,7,10,1,4,5,10,7,1,10,1,8,4,9,10,5,1,7,9,4,8,7,10,9,10,9,1,7,6,1,7,1,1,7,5,9,2,10,3,3,4,1,10,9,2,9,10,6,1,3,5,1,6,1,7,5,8,10,4,10,7,3,10,3,5,10,10,6,10,1,4,9,7,7,6,6,1,4,4,7,5,5,10,1,10,7,4,5,7,7,10,1,3,10,6,8,5,10,10,9,5,5,7,2,9,1,8,10,10,5,6,3,4,1,5,7,4,1,1,1,8,8,8,8,10,6,10,10,6,9,4,8,4,6,7,10,9,8,10,7,7,1,1,5,7,6,8,7,10,6,7,7,6,10,9,10,7,8,8,7,1,10,9,5,7,7,9,8,4,10,4,7,1,7,10,1,7,9,7,5,10,8,8,10,8,10,8,8,10,1,10,7,5,3,1,10,7,1,9,10,7,5,8,7,6,5,10,5,5,10,6,6,4,9,7,8,1,2,2,7,4,1,8,7,7,1,10,3,1,6,10,1,8,7,8,6,5,6,8,6,3,10,5,6,2,7,8,9,10,10,2,1,5,10,1,10,4,1,7,7,10,8,1,4,3,3,9,9,10,9,10,6,4,5,10,10,8,3,8,8,10,7,5,7,9,1,5,10,2,7,5,9,10,1,4,7,8,2,10,10,8,4,10,10,8,5,4,4,10,1,8,6,6,5,7,6,7,8,10,10,6,4,7,5,4,9,6,10,10,1,6,10,1,1,8,10,4,7,4,7,7,6,7,7,9,7,4,10,4,7,6,8,4,5,9,6,8,8,6,5,8,5,6,10,9,3,7,10,9,8,7,7,6,7,5,5,8,8,10,7,8,1,2,7,7,10,10,1,6,7,10,9,8,9,1,1,10,8,2,9,1,4,4,5,6,2,1,4,10,6,1,3,2,10,2,8,7,10,8,5,6,10,7,7,1,6,7,8,6,8,6,1,1,5,8,8,1,4,5,7,10,5,6,6,7,7,6,1,7,7,5,7,7,4,4,8,6,2,9,6,7,10,2,3,8,10,6,2,9,3,4,10,2,7,8,7,6,7,4,7,7,9,5,7,7,6,7,7,7,10,9,10,4,10,8,10,5,6,4,8,7,7,5,6,8,10,8,6,9,7,9,8,7,5,8,2,6,10,10,10,8,2,7,3,8,1,6,5,7,10,7,5,5,8,2,9,6,7,9,6,8,6,7,6,5,8,6,10,1,5,7,7,8,7,7,7,10,10,7,10,3,8,1,8,10,5,7,8,7,8,8,4,1,10,7,7,10,9,5,7,10,10,10,9,10,1,10,8,4,4,10,9,5,6,9,7,7,10,5,10,10,7,6,6,7,6,4,7,8,7,7,3,7,10,2,6,8,10,7,8,1,10,7,9,7,10,6,6,5,9,7,7,3,7,10,5,7,9,10,5,5,7,8,8,8,8,1,9,8,10,6,7,8,4,8,7,6,8,8,8,7,7,10,8,6,10,10,9,3,7,5,8,9,7,8,7,
3,7,9,7,6,7,10,6,10,8,1,7,8,7,8,9,8,8,1,10,9,10,7,8,3,3,10,9,9,6,1,4,8,6,4,8,5,9,10,8,10,8,6,6,6,6,9,9,10,8,8,8,7,3,7,6,6,1,10,7,6,4,10,4,3,3,5,6,9,10,7,10,5,6,1,10,10,6,1,9,5,4,7,6,4,7,10,5,6,10,1,7,7,8,4,6,10,9,7,9,9,4,10,7,4,6,9,5,4,10,9,2,7,5,7,5,7,5,8,7,8,10,4,5,8,1,6,8,5,10,5,7,6,9,10,5,8,10,9,10,8,1,7,8,7,6,10,8,10,4,8,6,10,7,6,10,6,9,6,1,7,1,8,10,10,9,1,10,10,6,4,9,10,4,8,10,8,10,8,3,5,4,1,1,6,7,8,7,2,10,8,8,9,8,6,7,5,1,8,3,10,10,10,3,10,8,6,9,4,8,5,8,6,9,10,7,6,10,7,10,10,5,10,5,6,7,10,7,6,9,6,6,10,5,8,8,4,7,7,7,6,9,8,2,6,3,7,4,4,6,8,8,10,7,9,9,5,10,8,8,8,9,8,9,10,10,1,9,2,10,5,2,7,8,7,7,7,9,8,8,8,7,8,10,4,5,7,6,8,7,7,4,8,8,10,1,10,8,8,6,10,7,2,2,1,10,10,7,7,7,7,3,10,10,7,7,10,6,1,8,10,6,10,10,8,8,4,10,9,1,9,9,6,7,7,7,6,1,1,10,7,2,10,9,5,8,3,8,7,5,8,10,10,6,4,10,9,8,4,5,10,10,8,8,8,8,7,9,1,2,6,5,3,10,10,9,7,7,10,6,5,6,6,6,5,7,7,10,7,7,1,10,6,10,9,10,3,8,1,7,5,7,7,10,10,1,4,10,8,1,9,10,7,8,6,7,10,6,6,4,1,7,6,8,9,10,7,8,8,7,9,10,10,10,8,9,6,4,9,7,10,7,3,8,8,7,5,7,5,10,7,9,7,9,7,10,8,6,10,8,10,4,1,1,7,10,1,5,7,8,7,7,7,5,8,7,7,8,6,10,7,5,6,3,5,8,2,8,7,8,7,7,9,7,10,3,7,10,7,9,5,10,8,7,10,7,8,10,8,4,7,9,7,10,10,9,10,3,9,9,7,4,6,7,5,10,1,5,7,10,9,7,10,3,7,7,3,9,7,7,10,6,8,7,8,6,7,8,1,3,1,10,10,8,7,3,8,6,9,4,8,8,8,10,9,10,10,7,8,10,8,10,10,9,10,4,2,7,7,10,7,8,7,7,9,8,9,9,9,8,9,7,4,10,10,7,10,7,8,8,10,5,7,7,10,10,8,7,7,7,7,9,8,7,5,7,4,5,7,7,8,6,1,9,7,8,7,9,10,9,1,10,6,7,10,7,9,10,9,8,7,10,8,7,10,9,8,7,9,9,10,8,7,8,5,10,8,5,9,5]" height="191" width="384" class="chartjs-render-monitor" style="display: block; height: 213px; width: 427px;"></canvas>'
我知道这很乱,但不知道如何编辑 html 以避免损坏东西。
我想做的是一个 'small'(小巧)的东西,它只处理 json——方括号之间的数据内容就是一个列表。
到目前为止我有:
# re.match only tries to match at the very start of html_str ('<canvas ...');
# after the first character there is no digit, so this call returns None.
val_list = re.match(r'[^[](\d+\,)+\w', html_str)
该正则的意图是:不包括左方括号;一个或多个数字后跟一个逗号,这一整组可重复任意多次,最后再跟一个数字。
结果是 val_list 中存入了 None(NoneType)。
目标是随后将其转换为列表,但由于没有任何东西('nothing')可以转换为列表,我就卡住了。
在下面的答案给出之后,完成的代码如下:
def json_extract(html_str, prior_total=45, prior_count=10):
    """Print the smoothed and plain averages of an element's ratings.

    The markup in *html_str* is parsed as XML and the ``data-json``
    attribute of its root element is decoded as a JSON list of numbers.

    Args:
        html_str: Markup of a single element (for example a copied
            ``<canvas>`` tag) that carries a ``data-json`` attribute.
        prior_total: Sum of the pseudo-ratings added before averaging.
        prior_count: Number of pseudo-ratings added before averaging.

    Returns:
        None. Both averages are written to standard output.

    Raises:
        ValueError: If the root element has no ``data-json`` attribute.
        xml.etree.ElementTree.ParseError: If *html_str* is not
            well-formed XML.
    """
    import xml.etree.ElementTree as ET
    import json
    import numpy

    canvas = ET.fromstring(html_str)
    raw = canvas.attrib.get('data-json')
    if raw is None:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("element has no 'data-json' attribute")
    val_list = json.loads(raw)

    # NOTE(review): the defaults add 45 spread over 10 pseudo-ratings; one
    # vote for each rating 1..10 would total 55 -- confirm the intended prior.
    bayes_average = (sum(val_list) + prior_total) / (len(val_list) + prior_count)
    average = numpy.mean(val_list)
    print(f'bayes average: {bayes_average} \naverage {average}')
#new cell
html_str = '<canvas id="ratings_histogram" data-json="[10,4,4,5,8,8,9,10,10,7,9,6,6,7,3,7,7,2,4,3,10,4,8,4,4,4,8,10,4,7,10,7,5,8,1,10,6,4,9,7,1,1,7,10,3,9,5,1,4,7,2,1,6,5,2,5,7,10,10,5,5,4,1,7,10,6,7,4,4,4,9,1,7,10,1,6,10,1,3,2,8,10,3,10,9,2,6,1,4,1,8,3,5,10,5,5,4,3,7,3,4,1,1,6,10,3,1,2,6,7,4,7,6,1,7,3,5,4,6,4,7,9,10,3,7,5,8,5,6,6,1,5,4,10,3,1,2,1,6,10,8,8,6,6,10,7,7,7,7,7,4,10,1,6,6,4,4,7,7,5,1,4,10,4,1,10,2,3,2,7,10,4,1,8,8,6,7,2,10,8,8,3,1,7,10,10,8,5,1,1,5,10,6,6,4,10,1,8,4,8,3,10,10,3,10,5,1,10,4,7,9,6,3,6,6,6,9,9,8,7,1,6,6,5,4,3,7,10,1,4,5,10,7,1,10,1,8,4,9,10,5,1,7,9,4,8,7,10,9,10,9,1,7,6,1,7,1,1,7,5,9,2,10,3,3,4,1,10,9,2,9,10,6,1,3,5,1,6,1,7,5,8,10,4,10,7,3,10,3,5,10,10,6,10,1,4,9,7,7,6,6,1,4,4,7,5,5,10,1,10,7,4,5,7,7,10,1,3,10,6,8,5,10,10,9,5,5,7,2,9,1,8,10,10,5,6,3,4,1,5,7,4,1,1,1,8,8,8,8,10,6,10,10,6,9,4,8,4,6,7,10,9,8,10,7,7,1,1,5,7,6,8,7,10,6,7,7,6,10,9,10,7,8,8,7,1,10,9,5,7,7,9,8,4,10,4,7,1,7,10,1,7,9,7,5,10,8,8,10,8,10,8,8,10,1,10,7,5,3,1,10,7,1,9,10,7,5,8,7,6,5,10,5,5,10,6,6,4,9,7,8,1,2,2,7,4,1,8,7,7,1,10,3,1,6,10,1,8,7,8,6,5,6,8,6,3,10,5,6,2,7,8,9,10,10,2,1,5,10,1,10,4,1,7,7,10,8,1,4,3,3,9,9,10,9,10,6,4,5,10,10,8,3,8,8,10,7,5,7,9,1,5,10,2,7,5,9,10,1,4,7,8,2,10,10,8,4,10,10,8,5,4,4,10,1,8,6,6,5,7,6,7,8,10,10,6,4,7,5,4,9,6,10,10,1,6,10,1,1,8,10,4,7,4,7,7,6,7,7,9,7,4,10,4,7,6,8,4,5,9,6,8,8,6,5,8,5,6,10,9,3,7,10,9,8,7,7,6,7,5,5,8,8,10,7,8,1,2,7,7,10,10,1,6,7,10,9,8,9,1,1,10,8,2,9,1,4,4,5,6,2,1,4,10,6,1,3,2,10,2,8,7,10,8,5,6,10,7,7,1,6,7,8,6,8,6,1,1,5,8,8,1,4,5,7,10,5,6,6,7,7,6,1,7,7,5,7,7,4,4,8,6,2,9,6,7,10,2,3,8,10,6,2,9,3,4,10,2,7,8,7,6,7,4,7,7,9,5,7,7,6,7,7,7,10,9,10,4,10,8,10,5,6,4,8,7,7,5,6,8,10,8,6,9,7,9,8,7,5,8,2,6,10,10,10,8,2,7,3,8,1,6,5,7,10,7,5,5,8,2,9,6,7,9,6,8,6,7,6,5,8,6,10,1,5,7,7,8,7,7,7,10,10,7,10,3,8,1,8,10,5,7,8,7,8,8,4,1,10,7,7,10,9,5,7,10,10,10,9,10,1,10,8,4,4,10,9,5,6,9,7,7,10,5,10,10,7,6,6,7,6,4,7,8,7,7,3,7,10,2,6,8,10,7,8,1,10,7,9,7,10,6,6,5,9,7,7,3,7,10,5,7,9,10,5,5,7,8,8,8,8,1,9,8,10,6,7,8,4,8,7,6,8,8,8,7,7,10,8,6,10,10,9,3,7,5
,8,9,7,8,7,3,7,9,7,6,7,10,6,10,8,1,7,8,7,8,9,8,8,1,10,9,10,7,8,3,3,10,9,9,6,1,4,8,6,4,8,5,9,10,8,10,8,6,6,6,6,9,9,10,8,8,8,7,3,7,6,6,1,10,7,6,4,10,4,3,3,5,6,9,10,7,10,5,6,1,10,10,6,1,9,5,4,7,6,4,7,10,5,6,10,1,7,7,8,4,6,10,9,7,9,9,4,10,7,4,6,9,5,4,10,9,2,7,5,7,5,7,5,8,7,8,10,4,5,8,1,6,8,5,10,5,7,6,9,10,5,8,10,9,10,8,1,7,8,7,6,10,8,10,4,8,6,10,7,6,10,6,9,6,1,7,1,8,10,10,9,1,10,10,6,4,9,10,4,8,10,8,10,8,3,5,4,1,1,6,7,8,7,2,10,8,8,9,8,6,7,5,1,8,3,10,10,10,3,10,8,6,9,4,8,5,8,6,9,10,7,6,10,7,10,10,5,10,5,6,7,10,7,6,9,6,6,10,5,8,8,4,7,7,7,6,9,8,2,6,3,7,4,4,6,8,8,10,7,9,9,5,10,8,8,8,9,8,9,10,10,1,9,2,10,5,2,7,8,7,7,7,9,8,8,8,7,8,10,4,5,7,6,8,7,7,4,8,8,10,1,10,8,8,6,10,7,2,2,1,10,10,7,7,7,7,3,10,10,7,7,10,6,1,8,10,6,10,10,8,8,4,10,9,1,9,9,6,7,7,7,6,1,1,10,7,2,10,9,5,8,3,8,7,5,8,10,10,6,4,10,9,8,4,5,10,10,8,8,8,8,7,9,1,2,6,5,3,10,10,9,7,7,10,6,5,6,6,6,5,7,7,10,7,7,1,10,6,10,9,10,3,8,1,7,5,7,7,10,10,1,4,10,8,1,9,10,7,8,6,7,10,6,6,4,1,7,6,8,9,10,7,8,8,7,9,10,10,10,8,9,6,4,9,7,10,7,3,8,8,7,5,7,5,10,7,9,7,9,7,10,8,6,10,8,10,4,1,1,7,10,1,5,7,8,7,7,7,5,8,7,7,8,6,10,7,5,6,3,5,8,2,8,7,8,7,7,9,7,10,3,7,10,7,9,5,10,8,7,10,7,8,10,8,4,7,9,7,10,10,9,10,3,9,9,7,4,6,7,5,10,1,5,7,10,9,7,10,3,7,7,3,9,7,7,10,6,8,7,8,6,7,8,1,3,1,10,10,8,7,3,8,6,9,4,8,8,8,10,9,10,10,7,8,10,8,10,10,9,10,4,2,7,7,10,7,8,7,7,9,8,9,9,9,8,9,7,4,10,10,7,10,7,8,8,10,5,7,7,10,10,8,7,7,7,7,9,8,7,5,7,4,5,7,7,8,6,1,9,7,8,7,9,10,9,1,10,6,7,10,7,9,10,9,8,7,10,8,7,10,9,8,7,9,9,10,8,7,8,5,10,8,5,9,5]" height="191" width="384" class="chartjs-render-monitor" style="display: block; height: 213px; width: 427px;"></canvas>'
# Prints the smoothed and plain averages for the element pasted above;
# the call itself evaluates to None.
json_extract(html_str)
其中,当您从某个网站复制元素时,您可以找到实际平均值和贝叶斯平滑评分(每个类别加 1)。
在该网站的几个系列上测试后,平均值在某些系列上很接近,在另一些系列上则相差较大。
注意:该代码适用于任何数字列表。
我建议使用 XML 解析器来获取您需要的数据,然后将取得的属性值作为 JSON 字符串进行解析。
import xml.etree.ElementTree as ET
import json
# Parse the copied element as XML so its attributes can be read directly.
canvas = ET.fromstring(html_str)
if 'data-json' in canvas.attrib:
    # In the sample element the attribute holds a JSON array, which
    # json.loads turns into a Python list.
    data = json.loads(canvas.attrib['data-json'])