我有一行通过 'copy element' 复制得到的 HTML,正在尝试写一个正则表达式,去除无关内容、只保留其中的列表。

I have a line of HTML obtained via 'copy element', and I am trying to write a regex that strips out the irrelevant parts and keeps only the list.

html_str:

'<canvas id="ratings_histogram" data-json="[10,4,4,5,8,8,9,10,10,7,9,6,6,7,3,7,7,2,4,3,10,4,8,4,4,4,8,10,4,7,10,7,5,8,1,10,6,4,9,7,1,1,7,10,3,9,5,1,4,7,2,1,6,5,2,5,7,10,10,5,5,4,1,7,10,6,7,4,4,4,9,1,7,10,1,6,10,1,3,2,8,10,3,10,9,2,6,1,4,1,8,3,5,10,5,5,4,3,7,3,4,1,1,6,10,3,1,2,6,7,4,7,6,1,7,3,5,4,6,4,7,9,10,3,7,5,8,5,6,6,1,5,4,10,3,1,2,1,6,10,8,8,6,6,10,7,7,7,7,7,4,10,1,6,6,4,4,7,7,5,1,4,10,4,1,10,2,3,2,7,10,4,1,8,8,6,7,2,10,8,8,3,1,7,10,10,8,5,1,1,5,10,6,6,4,10,1,8,4,8,3,10,10,3,10,5,1,10,4,7,9,6,3,6,6,6,9,9,8,7,1,6,6,5,4,3,7,10,1,4,5,10,7,1,10,1,8,4,9,10,5,1,7,9,4,8,7,10,9,10,9,1,7,6,1,7,1,1,7,5,9,2,10,3,3,4,1,10,9,2,9,10,6,1,3,5,1,6,1,7,5,8,10,4,10,7,3,10,3,5,10,10,6,10,1,4,9,7,7,6,6,1,4,4,7,5,5,10,1,10,7,4,5,7,7,10,1,3,10,6,8,5,10,10,9,5,5,7,2,9,1,8,10,10,5,6,3,4,1,5,7,4,1,1,1,8,8,8,8,10,6,10,10,6,9,4,8,4,6,7,10,9,8,10,7,7,1,1,5,7,6,8,7,10,6,7,7,6,10,9,10,7,8,8,7,1,10,9,5,7,7,9,8,4,10,4,7,1,7,10,1,7,9,7,5,10,8,8,10,8,10,8,8,10,1,10,7,5,3,1,10,7,1,9,10,7,5,8,7,6,5,10,5,5,10,6,6,4,9,7,8,1,2,2,7,4,1,8,7,7,1,10,3,1,6,10,1,8,7,8,6,5,6,8,6,3,10,5,6,2,7,8,9,10,10,2,1,5,10,1,10,4,1,7,7,10,8,1,4,3,3,9,9,10,9,10,6,4,5,10,10,8,3,8,8,10,7,5,7,9,1,5,10,2,7,5,9,10,1,4,7,8,2,10,10,8,4,10,10,8,5,4,4,10,1,8,6,6,5,7,6,7,8,10,10,6,4,7,5,4,9,6,10,10,1,6,10,1,1,8,10,4,7,4,7,7,6,7,7,9,7,4,10,4,7,6,8,4,5,9,6,8,8,6,5,8,5,6,10,9,3,7,10,9,8,7,7,6,7,5,5,8,8,10,7,8,1,2,7,7,10,10,1,6,7,10,9,8,9,1,1,10,8,2,9,1,4,4,5,6,2,1,4,10,6,1,3,2,10,2,8,7,10,8,5,6,10,7,7,1,6,7,8,6,8,6,1,1,5,8,8,1,4,5,7,10,5,6,6,7,7,6,1,7,7,5,7,7,4,4,8,6,2,9,6,7,10,2,3,8,10,6,2,9,3,4,10,2,7,8,7,6,7,4,7,7,9,5,7,7,6,7,7,7,10,9,10,4,10,8,10,5,6,4,8,7,7,5,6,8,10,8,6,9,7,9,8,7,5,8,2,6,10,10,10,8,2,7,3,8,1,6,5,7,10,7,5,5,8,2,9,6,7,9,6,8,6,7,6,5,8,6,10,1,5,7,7,8,7,7,7,10,10,7,10,3,8,1,8,10,5,7,8,7,8,8,4,1,10,7,7,10,9,5,7,10,10,10,9,10,1,10,8,4,4,10,9,5,6,9,7,7,10,5,10,10,7,6,6,7,6,4,7,8,7,7,3,7,10,2,6,8,10,7,8,1,10,7,9,7,10,6,6,5,9,7,7,3,7,10,5,7,9,10,5,5,7,8,8,8,8,1,9,8,10,6,7,8,4,8,7,6,8,8,8,7,7,10,8,6,10,10,9,3,7,5,8,9,7,8,7,
3,7,9,7,6,7,10,6,10,8,1,7,8,7,8,9,8,8,1,10,9,10,7,8,3,3,10,9,9,6,1,4,8,6,4,8,5,9,10,8,10,8,6,6,6,6,9,9,10,8,8,8,7,3,7,6,6,1,10,7,6,4,10,4,3,3,5,6,9,10,7,10,5,6,1,10,10,6,1,9,5,4,7,6,4,7,10,5,6,10,1,7,7,8,4,6,10,9,7,9,9,4,10,7,4,6,9,5,4,10,9,2,7,5,7,5,7,5,8,7,8,10,4,5,8,1,6,8,5,10,5,7,6,9,10,5,8,10,9,10,8,1,7,8,7,6,10,8,10,4,8,6,10,7,6,10,6,9,6,1,7,1,8,10,10,9,1,10,10,6,4,9,10,4,8,10,8,10,8,3,5,4,1,1,6,7,8,7,2,10,8,8,9,8,6,7,5,1,8,3,10,10,10,3,10,8,6,9,4,8,5,8,6,9,10,7,6,10,7,10,10,5,10,5,6,7,10,7,6,9,6,6,10,5,8,8,4,7,7,7,6,9,8,2,6,3,7,4,4,6,8,8,10,7,9,9,5,10,8,8,8,9,8,9,10,10,1,9,2,10,5,2,7,8,7,7,7,9,8,8,8,7,8,10,4,5,7,6,8,7,7,4,8,8,10,1,10,8,8,6,10,7,2,2,1,10,10,7,7,7,7,3,10,10,7,7,10,6,1,8,10,6,10,10,8,8,4,10,9,1,9,9,6,7,7,7,6,1,1,10,7,2,10,9,5,8,3,8,7,5,8,10,10,6,4,10,9,8,4,5,10,10,8,8,8,8,7,9,1,2,6,5,3,10,10,9,7,7,10,6,5,6,6,6,5,7,7,10,7,7,1,10,6,10,9,10,3,8,1,7,5,7,7,10,10,1,4,10,8,1,9,10,7,8,6,7,10,6,6,4,1,7,6,8,9,10,7,8,8,7,9,10,10,10,8,9,6,4,9,7,10,7,3,8,8,7,5,7,5,10,7,9,7,9,7,10,8,6,10,8,10,4,1,1,7,10,1,5,7,8,7,7,7,5,8,7,7,8,6,10,7,5,6,3,5,8,2,8,7,8,7,7,9,7,10,3,7,10,7,9,5,10,8,7,10,7,8,10,8,4,7,9,7,10,10,9,10,3,9,9,7,4,6,7,5,10,1,5,7,10,9,7,10,3,7,7,3,9,7,7,10,6,8,7,8,6,7,8,1,3,1,10,10,8,7,3,8,6,9,4,8,8,8,10,9,10,10,7,8,10,8,10,10,9,10,4,2,7,7,10,7,8,7,7,9,8,9,9,9,8,9,7,4,10,10,7,10,7,8,8,10,5,7,7,10,10,8,7,7,7,7,9,8,7,5,7,4,5,7,7,8,6,1,9,7,8,7,9,10,9,1,10,6,7,10,7,9,10,9,8,7,10,8,7,10,9,8,7,9,9,10,8,7,8,5,10,8,5,9,5]" height="191" width="384" class="chartjs-render-monitor" style="display: block; height: 213px; width: 427px;"></canvas>'

我知道这很乱,但不知道如何编辑 html 以避免损坏东西。

我想要的是一个 'small' 的工具,只处理 data-json 部分——方括号之间的数据内容其实就是一个列表。

到目前为止我有:

val_list = re.match(r'[^[](\d+\,)+\w', html_str)

我的意图是:跳过左方括号;匹配一个或多个“若干数字后跟一个逗号”的组合,重复任意多次;最后再匹配末尾的那个数字。

结果 val_list 得到的是 None(NoneType):re.match 只从字符串开头开始匹配,而 html_str 以 '<canvas' 开头,所以匹配失败。

目标是随后把匹配结果转换为列表,但既然什么都没匹配到('nothing'),也就无从转换,我就卡住了。

根据下面给出的答案,完成后的代码如下:

def json_extract(html_str):
    """Print the Bayesian-smoothed and plain averages of an element's ratings.

    Parses ``html_str`` as a single XML element, decodes the JSON array held
    in its ``data-json`` attribute, and prints two figures: the mean after
    adding one pseudo-vote to every rating category 1..10, and the plain
    arithmetic mean.

    Args:
        html_str: Markup for one element (e.g. a copied ``<canvas>``) whose
            ``data-json`` attribute holds a JSON array of numeric ratings.
            It must be well-formed XML, because it is parsed with
            ``xml.etree.ElementTree``.

    Returns:
        None. The two averages are written to stdout.

    Raises:
        ValueError: If the element has no ``data-json`` attribute, or the
            attribute is not valid JSON (``json.JSONDecodeError``).
        xml.etree.ElementTree.ParseError: If ``html_str`` is not well-formed.
    """
    import xml.etree.ElementTree as ET
    import json
    import numpy

    canvas = ET.fromstring(html_str)
    raw = canvas.get('data-json')
    if raw is None:
        # Without the attribute there is nothing to average; fail with a
        # clear message instead of a NameError on val_list further down.
        raise ValueError("element has no 'data-json' attribute")
    val_list = json.loads(raw)

    # Smoothing prior: one pseudo-vote in each rating category 1..10.
    # That adds 1 + 2 + ... + 10 = 55 to the total and 10 to the count.
    prior_votes = range(1, 11)
    tot = sum(val_list) + sum(prior_votes)
    bayes_average = tot / (len(val_list) + len(prior_votes))

    # numpy.mean of an empty list emits a RuntimeWarning and yields nan;
    # produce the same nan directly so empty input stays quiet.
    average = numpy.mean(val_list) if val_list else float('nan')

    print('bayes average: {} \naverage {}'.format(bayes_average, average))


#new cell
html_str = '<canvas id="ratings_histogram" data-json="[10,4,4,5,8,8,9,10,10,7,9,6,6,7,3,7,7,2,4,3,10,4,8,4,4,4,8,10,4,7,10,7,5,8,1,10,6,4,9,7,1,1,7,10,3,9,5,1,4,7,2,1,6,5,2,5,7,10,10,5,5,4,1,7,10,6,7,4,4,4,9,1,7,10,1,6,10,1,3,2,8,10,3,10,9,2,6,1,4,1,8,3,5,10,5,5,4,3,7,3,4,1,1,6,10,3,1,2,6,7,4,7,6,1,7,3,5,4,6,4,7,9,10,3,7,5,8,5,6,6,1,5,4,10,3,1,2,1,6,10,8,8,6,6,10,7,7,7,7,7,4,10,1,6,6,4,4,7,7,5,1,4,10,4,1,10,2,3,2,7,10,4,1,8,8,6,7,2,10,8,8,3,1,7,10,10,8,5,1,1,5,10,6,6,4,10,1,8,4,8,3,10,10,3,10,5,1,10,4,7,9,6,3,6,6,6,9,9,8,7,1,6,6,5,4,3,7,10,1,4,5,10,7,1,10,1,8,4,9,10,5,1,7,9,4,8,7,10,9,10,9,1,7,6,1,7,1,1,7,5,9,2,10,3,3,4,1,10,9,2,9,10,6,1,3,5,1,6,1,7,5,8,10,4,10,7,3,10,3,5,10,10,6,10,1,4,9,7,7,6,6,1,4,4,7,5,5,10,1,10,7,4,5,7,7,10,1,3,10,6,8,5,10,10,9,5,5,7,2,9,1,8,10,10,5,6,3,4,1,5,7,4,1,1,1,8,8,8,8,10,6,10,10,6,9,4,8,4,6,7,10,9,8,10,7,7,1,1,5,7,6,8,7,10,6,7,7,6,10,9,10,7,8,8,7,1,10,9,5,7,7,9,8,4,10,4,7,1,7,10,1,7,9,7,5,10,8,8,10,8,10,8,8,10,1,10,7,5,3,1,10,7,1,9,10,7,5,8,7,6,5,10,5,5,10,6,6,4,9,7,8,1,2,2,7,4,1,8,7,7,1,10,3,1,6,10,1,8,7,8,6,5,6,8,6,3,10,5,6,2,7,8,9,10,10,2,1,5,10,1,10,4,1,7,7,10,8,1,4,3,3,9,9,10,9,10,6,4,5,10,10,8,3,8,8,10,7,5,7,9,1,5,10,2,7,5,9,10,1,4,7,8,2,10,10,8,4,10,10,8,5,4,4,10,1,8,6,6,5,7,6,7,8,10,10,6,4,7,5,4,9,6,10,10,1,6,10,1,1,8,10,4,7,4,7,7,6,7,7,9,7,4,10,4,7,6,8,4,5,9,6,8,8,6,5,8,5,6,10,9,3,7,10,9,8,7,7,6,7,5,5,8,8,10,7,8,1,2,7,7,10,10,1,6,7,10,9,8,9,1,1,10,8,2,9,1,4,4,5,6,2,1,4,10,6,1,3,2,10,2,8,7,10,8,5,6,10,7,7,1,6,7,8,6,8,6,1,1,5,8,8,1,4,5,7,10,5,6,6,7,7,6,1,7,7,5,7,7,4,4,8,6,2,9,6,7,10,2,3,8,10,6,2,9,3,4,10,2,7,8,7,6,7,4,7,7,9,5,7,7,6,7,7,7,10,9,10,4,10,8,10,5,6,4,8,7,7,5,6,8,10,8,6,9,7,9,8,7,5,8,2,6,10,10,10,8,2,7,3,8,1,6,5,7,10,7,5,5,8,2,9,6,7,9,6,8,6,7,6,5,8,6,10,1,5,7,7,8,7,7,7,10,10,7,10,3,8,1,8,10,5,7,8,7,8,8,4,1,10,7,7,10,9,5,7,10,10,10,9,10,1,10,8,4,4,10,9,5,6,9,7,7,10,5,10,10,7,6,6,7,6,4,7,8,7,7,3,7,10,2,6,8,10,7,8,1,10,7,9,7,10,6,6,5,9,7,7,3,7,10,5,7,9,10,5,5,7,8,8,8,8,1,9,8,10,6,7,8,4,8,7,6,8,8,8,7,7,10,8,6,10,10,9,3,7,5
,8,9,7,8,7,3,7,9,7,6,7,10,6,10,8,1,7,8,7,8,9,8,8,1,10,9,10,7,8,3,3,10,9,9,6,1,4,8,6,4,8,5,9,10,8,10,8,6,6,6,6,9,9,10,8,8,8,7,3,7,6,6,1,10,7,6,4,10,4,3,3,5,6,9,10,7,10,5,6,1,10,10,6,1,9,5,4,7,6,4,7,10,5,6,10,1,7,7,8,4,6,10,9,7,9,9,4,10,7,4,6,9,5,4,10,9,2,7,5,7,5,7,5,8,7,8,10,4,5,8,1,6,8,5,10,5,7,6,9,10,5,8,10,9,10,8,1,7,8,7,6,10,8,10,4,8,6,10,7,6,10,6,9,6,1,7,1,8,10,10,9,1,10,10,6,4,9,10,4,8,10,8,10,8,3,5,4,1,1,6,7,8,7,2,10,8,8,9,8,6,7,5,1,8,3,10,10,10,3,10,8,6,9,4,8,5,8,6,9,10,7,6,10,7,10,10,5,10,5,6,7,10,7,6,9,6,6,10,5,8,8,4,7,7,7,6,9,8,2,6,3,7,4,4,6,8,8,10,7,9,9,5,10,8,8,8,9,8,9,10,10,1,9,2,10,5,2,7,8,7,7,7,9,8,8,8,7,8,10,4,5,7,6,8,7,7,4,8,8,10,1,10,8,8,6,10,7,2,2,1,10,10,7,7,7,7,3,10,10,7,7,10,6,1,8,10,6,10,10,8,8,4,10,9,1,9,9,6,7,7,7,6,1,1,10,7,2,10,9,5,8,3,8,7,5,8,10,10,6,4,10,9,8,4,5,10,10,8,8,8,8,7,9,1,2,6,5,3,10,10,9,7,7,10,6,5,6,6,6,5,7,7,10,7,7,1,10,6,10,9,10,3,8,1,7,5,7,7,10,10,1,4,10,8,1,9,10,7,8,6,7,10,6,6,4,1,7,6,8,9,10,7,8,8,7,9,10,10,10,8,9,6,4,9,7,10,7,3,8,8,7,5,7,5,10,7,9,7,9,7,10,8,6,10,8,10,4,1,1,7,10,1,5,7,8,7,7,7,5,8,7,7,8,6,10,7,5,6,3,5,8,2,8,7,8,7,7,9,7,10,3,7,10,7,9,5,10,8,7,10,7,8,10,8,4,7,9,7,10,10,9,10,3,9,9,7,4,6,7,5,10,1,5,7,10,9,7,10,3,7,7,3,9,7,7,10,6,8,7,8,6,7,8,1,3,1,10,10,8,7,3,8,6,9,4,8,8,8,10,9,10,10,7,8,10,8,10,10,9,10,4,2,7,7,10,7,8,7,7,9,8,9,9,9,8,9,7,4,10,10,7,10,7,8,8,10,5,7,7,10,10,8,7,7,7,7,9,8,7,5,7,4,5,7,7,8,6,1,9,7,8,7,9,10,9,1,10,6,7,10,7,9,10,9,8,7,10,8,7,10,9,8,7,9,9,10,8,7,8,5,10,8,5,9,5]" height="191" width="384" class="chartjs-render-monitor" style="display: block; height: 213px; width: 427px;"></canvas>'

# Parse the copied element held in html_str and print its Bayesian-smoothed
# and plain averages (json_extract prints; it does not return a value).
json_extract(html_str)

这样,从某个网站复制该元素后,就可以算出实际平均值和贝叶斯平滑评分(为每个评分类别各加 1 票)。

在该网站的几个系列上测试后,算出的平均值在有些系列上很接近,在另一些系列上则相差较大。

注意:该代码适用于任何数字列表。

我建议使用 XML 解析器来获取您需要的数据,然后把取到的属性值作为 JSON 字符串来解析。

import xml.etree.ElementTree as ET
import json

# Parse the copied element as XML; if it carries a data-json attribute,
# decode that attribute's JSON text into a Python object named ``data``.
# When the attribute is absent, ``data`` is left unassigned.
canvas = ET.fromstring(html_str)
if canvas.get('data-json') is not None:
    data = json.loads(canvas.get('data-json'))