应该如何添加两个基于字典列表的 Python 数据集?
How should two Python datasets that are based on a list of dictionaries be added?
我定义了一个数据集 class,它有一个事件列表,并且对于每个事件,都有一个变量值字典。数据集可用于使用索引轻松访问事件,可用于打乱事件,可用于预处理所有数据(重新缩放等),并可用于轻松地以 ASCII [=30] 格式呈现数据=] 在终端中。
我现在希望能够添加 数据集。这意味着我希望能够合并两个数据集,这两个数据集对于每个事件都有相同的变量集。我尝试过这样做,但是当我尝试将一个数据集添加到另一个数据集时,列以某种方式搞砸了。
所以,假设我有两个以下列方式开始的数据集:
|-----------------------------------------------------------------------------------------------------------|
|index|jet_2|nJets|jet_2|class|el_1_|jet_1|jet_2|nBTag|jet_1|met |jet_2|el_1_|jet_1|el_1_|Centr|jet_1|met_p|
| |_phi | |_e | |phi |_phi |_pt |s |_pt | |_eta |pt |_e |eta |ality|_eta |hi |
| | | | | | | | | | | | | | | |_all | | |
|-----------------------------------------------------------------------------------------------------------|
|1 |-2.64|5 |10827|0 |-1.20|0.636|10787|1 |18888|64427|-0.02|77078|19106|-0.99|0.905|-0.01|2.521|
| |34259| |3.75 | |04392|89154|1.367| |1.765|.3164|75653|.5546|0.656|58823|95465|71776|60954|
| |4147 | | | |1471 |3865 |188 | |625 |062 |58206|875 |25 |9193 |8985 |15314|475 |
| | | | | | | | | | | |6 | | | | |7 | |
|-----------------------------------------------------------------------------------------------------------|
|2 |0.034|4 |63592|0 |2.591|-0.59|33142|2 |91993|23694|-1.26|91370|16520|-0.83|0.608|-1.18|-2.26|
| |82930| |.6328| |75348|62130|.2421| |.4843|.5468|29560|.1406|1.25 |01038|56819|74392|86488|
| |36222| |125 | |282 |42736|875 | |75 |75 |2322 |25 | |74207|1528 |0326 |6284 |
|-----------------------------------------------------------------------------------------------------------|
|-----------------------------------------------------------------------------------------------------------|
|index|jet_2|nJets|jet_2|class|el_1_|jet_1|jet_2|nBTag|jet_1|met |jet_2|el_1_|jet_1|el_1_|Centr|jet_1|met_p|
| |_phi | |_e | |phi |_phi |_pt |s |_pt | |_eta |pt |_e |eta |ality|_eta |hi |
| | | | | | | | | | | | | | | |_all | | |
|-----------------------------------------------------------------------------------------------------------|
|1 |1.395|5 |16180|1 |-1.09|-1.73|76499|2 |11393|82580|-1.37|41392|11862|0.469|0.562|0.270|1.562|
| |24650| |8.828| |14194|03662|.0625| |7.421|.4062|69830|.0585|4.257|71315|93624|53004|49403|
| |574 | |125 | |5839 |3001 | | |875 |5 |4653 |938 |812 |1455 |6395 |5033 |954 |
|-----------------------------------------------------------------------------------------------------------|
|2 |1.376|7 |16091|1 |0.547|-2.22|13746|2 |23603|58867|0.560|11221|33601|0.371|0.763|0.886|-0.62|
| |49536| |5.296| |91748|10381|2.937| |9.062|.9648|20069|8.468|2.562|41576|15855|64764|06557|
| |133 | |875 | |5237 |031 |5 | |5 |438 |1223 |75 |5 |4093 |9799 |1659 |7507 |
|-----------------------------------------------------------------------------------------------------------|
如果我尝试添加这些数据集,我会得到以下信息:
|-----------------------------------------------------------------------------------------------------------|
|index|el_1_|nJets|jet_2|jet_2|jet_1|jet_2|nBTag|jet_1|met |jet_2|jet_1|el_1_|jet_1|el_1_|class|met_p|Centr|
| |phi | |_e |_phi |_phi |_pt |s |_pt | |_eta |_eta |pt |_e |eta | |hi |ality|
| | | | | | | | | | | | | | | | | |_all |
|-----------------------------------------------------------------------------------------------------------|
|1 |-1.20|5 |10827|-2.64|0.636|10787|1 |18888|64427|-0.02|-0.01|77078|19106|-0.99|0 |2.521|0.905|
| |04392| |3.75 |34259|89154|1.367| |1.765|.3164|75653|71776|.5546|0.656|58823| |60954|95465|
| |1471 | | |4147 |3865 |188 | |625 |062 |58206|15314|875 |25 |9193 | |475 |8985 |
| | | | | | | | | | |6 |7 | | | | | | |
|-----------------------------------------------------------------------------------------------------------|
|2 |0.034|4 |63592|0 |2.591|-0.59|33142|2 |91993|23694|-1.26|91370|16520|-0.83|0.608|-1.18|-2.26|
| |82930| |.6328| |75348|62130|.2421| |.4843|.5468|29560|.1406|1.25 |01038|56819|74392|86488|
| |36222| |125 | |282 |42736|875 | |75 |75 |2322 |25 | |74207|1528 |0326 |6284 |
|-----------------------------------------------------------------------------------------------------------|
您可以看到列的排序发生了一些奇怪的事情。谁能看出这是为什么?你能看到如何修复它吗?
数据集class如下:
class Dataset(object):
def __init__(
self
):
self._index = 0
self._data = {}
def index(
number = None
):
if number is not None:
self._index = number
return self._index
def indices(
self
):
return [index for index in self._data]
def variable(
self,
index = None,
name = None,
value = None
):
if index is not None:
self._index = index
if name is not None:
if value is not None:
try:
self._data[self._index][name] = value
except:
self._data[self._index] = {}
self._data[self._index][name] = value
return self._data[self._index][name]
def variables(
self,
index = 0
):
return [
variable for variable, value in self._data[self._index].iteritems()
]
def values(
self,
name = None
):
return [self._data[index][name] for index in self.indices()]
def table(
self
):
table_contents = ["index"]
table_contents.extend(self.variables())
table_contents = [table_contents]
for index in self.indices():
values = [
str(self.variable(
name = name,
index = index
)) for name in self.variables()]
row = [str(index)]
row.extend(values)
table_contents.append(row)
return pyprel.Table(
contents = table_contents
)
def normalize(
self,
name = None,
summation = None
):
values_raw = self.values(name = name)
values_normalized = normalize(
values_raw,
summation = summation
)
for index_normalized, index in enumerate(self.indices()):
self.variable(
index = index,
name = name,
value = values_normalized[index_normalized]
)
def normalize_all(
self
):
for name in self.variables():
self.normalize(name = name)
def preprocess(
self,
name = None
):
from sklearn import preprocessing
values_raw = self.values(name = name)
values_preprocessed = list(preprocessing.scale(values_raw))
for index_preprocessed, index in enumerate(self.indices()):
self.variable(
index = index,
name = name,
value = values_preprocessed[index_preprocessed]
)
def preprocess_all(
self,
skip_variables = ["class"]
):
for name in self.variables():
if name not in skip_variables:
self.preprocess(name = name)
def shuffle(
self,
name = None,
seed = 100
):
random.seed(seed)
values = self.values(name = name)
random.shuffle(values)
for index_shuffled, index in enumerate(self.indices()):
self.variable(
index = index,
name = name,
value = values[index_shuffled]
)
def shuffle_all(
self,
seed = 100
):
for name in self.variables():
self.shuffle(
name = name,
seed = seed
)
def add(
self,
dataset = None
):
index_current_maximum = max(self.indices())
for index_offset, index in enumerate(dataset.indices()):
for name in dataset.variables():
self.variable(
index = index_current_maximum + 1 + index_offset,
name = name,
value = dataset.variable(
index = index,
name = name
)
)
在 dict
次迭代中无法保证顺序。如果您希望按特定顺序排列密钥,则必须强制执行。您可以通过在对象定义中包含变量名称列表来实现。或者,如果始终按字母顺序排列就足够了,您可以
def variables(self, index = 0):
varsrc = self._data[index]
return sorted(varsrc.keys())
(请注意,我实际上使用了传递的索引,您的代码没有。)
作为一个单独的问题,如果您想从 dict
中提取键,传递 dict
比传递允许您找到 [=11] 的索引更有意义=].
我定义了一个数据集 class,它有一个事件列表,并且对于每个事件,都有一个变量值字典。数据集可用于使用索引轻松访问事件,可用于打乱事件,可用于预处理所有数据(重新缩放等),并可用于轻松地以 ASCII [=30] 格式呈现数据=] 在终端中。
我现在希望能够添加 数据集。这意味着我希望能够合并两个数据集,这两个数据集对于每个事件都有相同的变量集。我尝试过这样做,但是当我尝试将一个数据集添加到另一个数据集时,列以某种方式搞砸了。
所以,假设我有两个以下列方式开始的数据集:
|-----------------------------------------------------------------------------------------------------------|
|index|jet_2|nJets|jet_2|class|el_1_|jet_1|jet_2|nBTag|jet_1|met |jet_2|el_1_|jet_1|el_1_|Centr|jet_1|met_p|
| |_phi | |_e | |phi |_phi |_pt |s |_pt | |_eta |pt |_e |eta |ality|_eta |hi |
| | | | | | | | | | | | | | | |_all | | |
|-----------------------------------------------------------------------------------------------------------|
|1 |-2.64|5 |10827|0 |-1.20|0.636|10787|1 |18888|64427|-0.02|77078|19106|-0.99|0.905|-0.01|2.521|
| |34259| |3.75 | |04392|89154|1.367| |1.765|.3164|75653|.5546|0.656|58823|95465|71776|60954|
| |4147 | | | |1471 |3865 |188 | |625 |062 |58206|875 |25 |9193 |8985 |15314|475 |
| | | | | | | | | | | |6 | | | | |7 | |
|-----------------------------------------------------------------------------------------------------------|
|2 |0.034|4 |63592|0 |2.591|-0.59|33142|2 |91993|23694|-1.26|91370|16520|-0.83|0.608|-1.18|-2.26|
| |82930| |.6328| |75348|62130|.2421| |.4843|.5468|29560|.1406|1.25 |01038|56819|74392|86488|
| |36222| |125 | |282 |42736|875 | |75 |75 |2322 |25 | |74207|1528 |0326 |6284 |
|-----------------------------------------------------------------------------------------------------------|
|-----------------------------------------------------------------------------------------------------------|
|index|jet_2|nJets|jet_2|class|el_1_|jet_1|jet_2|nBTag|jet_1|met |jet_2|el_1_|jet_1|el_1_|Centr|jet_1|met_p|
| |_phi | |_e | |phi |_phi |_pt |s |_pt | |_eta |pt |_e |eta |ality|_eta |hi |
| | | | | | | | | | | | | | | |_all | | |
|-----------------------------------------------------------------------------------------------------------|
|1 |1.395|5 |16180|1 |-1.09|-1.73|76499|2 |11393|82580|-1.37|41392|11862|0.469|0.562|0.270|1.562|
| |24650| |8.828| |14194|03662|.0625| |7.421|.4062|69830|.0585|4.257|71315|93624|53004|49403|
| |574 | |125 | |5839 |3001 | | |875 |5 |4653 |938 |812 |1455 |6395 |5033 |954 |
|-----------------------------------------------------------------------------------------------------------|
|2 |1.376|7 |16091|1 |0.547|-2.22|13746|2 |23603|58867|0.560|11221|33601|0.371|0.763|0.886|-0.62|
| |49536| |5.296| |91748|10381|2.937| |9.062|.9648|20069|8.468|2.562|41576|15855|64764|06557|
| |133 | |875 | |5237 |031 |5 | |5 |438 |1223 |75 |5 |4093 |9799 |1659 |7507 |
|-----------------------------------------------------------------------------------------------------------|
如果我尝试添加这些数据集,我会得到以下信息:
|-----------------------------------------------------------------------------------------------------------|
|index|el_1_|nJets|jet_2|jet_2|jet_1|jet_2|nBTag|jet_1|met |jet_2|jet_1|el_1_|jet_1|el_1_|class|met_p|Centr|
| |phi | |_e |_phi |_phi |_pt |s |_pt | |_eta |_eta |pt |_e |eta | |hi |ality|
| | | | | | | | | | | | | | | | | |_all |
|-----------------------------------------------------------------------------------------------------------|
|1 |-1.20|5 |10827|-2.64|0.636|10787|1 |18888|64427|-0.02|-0.01|77078|19106|-0.99|0 |2.521|0.905|
| |04392| |3.75 |34259|89154|1.367| |1.765|.3164|75653|71776|.5546|0.656|58823| |60954|95465|
| |1471 | | |4147 |3865 |188 | |625 |062 |58206|15314|875 |25 |9193 | |475 |8985 |
| | | | | | | | | | |6 |7 | | | | | | |
|-----------------------------------------------------------------------------------------------------------|
|2 |0.034|4 |63592|0 |2.591|-0.59|33142|2 |91993|23694|-1.26|91370|16520|-0.83|0.608|-1.18|-2.26|
| |82930| |.6328| |75348|62130|.2421| |.4843|.5468|29560|.1406|1.25 |01038|56819|74392|86488|
| |36222| |125 | |282 |42736|875 | |75 |75 |2322 |25 | |74207|1528 |0326 |6284 |
|-----------------------------------------------------------------------------------------------------------|
您可以看到列的排序发生了一些奇怪的事情。谁能看出这是为什么?你能看到如何修复它吗?
数据集class如下:
class Dataset(object):
def __init__(
self
):
self._index = 0
self._data = {}
def index(
number = None
):
if number is not None:
self._index = number
return self._index
def indices(
self
):
return [index for index in self._data]
def variable(
self,
index = None,
name = None,
value = None
):
if index is not None:
self._index = index
if name is not None:
if value is not None:
try:
self._data[self._index][name] = value
except:
self._data[self._index] = {}
self._data[self._index][name] = value
return self._data[self._index][name]
def variables(
self,
index = 0
):
return [
variable for variable, value in self._data[self._index].iteritems()
]
def values(
self,
name = None
):
return [self._data[index][name] for index in self.indices()]
def table(
self
):
table_contents = ["index"]
table_contents.extend(self.variables())
table_contents = [table_contents]
for index in self.indices():
values = [
str(self.variable(
name = name,
index = index
)) for name in self.variables()]
row = [str(index)]
row.extend(values)
table_contents.append(row)
return pyprel.Table(
contents = table_contents
)
def normalize(
self,
name = None,
summation = None
):
values_raw = self.values(name = name)
values_normalized = normalize(
values_raw,
summation = summation
)
for index_normalized, index in enumerate(self.indices()):
self.variable(
index = index,
name = name,
value = values_normalized[index_normalized]
)
def normalize_all(
self
):
for name in self.variables():
self.normalize(name = name)
def preprocess(
self,
name = None
):
from sklearn import preprocessing
values_raw = self.values(name = name)
values_preprocessed = list(preprocessing.scale(values_raw))
for index_preprocessed, index in enumerate(self.indices()):
self.variable(
index = index,
name = name,
value = values_preprocessed[index_preprocessed]
)
def preprocess_all(
self,
skip_variables = ["class"]
):
for name in self.variables():
if name not in skip_variables:
self.preprocess(name = name)
def shuffle(
self,
name = None,
seed = 100
):
random.seed(seed)
values = self.values(name = name)
random.shuffle(values)
for index_shuffled, index in enumerate(self.indices()):
self.variable(
index = index,
name = name,
value = values[index_shuffled]
)
def shuffle_all(
self,
seed = 100
):
for name in self.variables():
self.shuffle(
name = name,
seed = seed
)
def add(
self,
dataset = None
):
index_current_maximum = max(self.indices())
for index_offset, index in enumerate(dataset.indices()):
for name in dataset.variables():
self.variable(
index = index_current_maximum + 1 + index_offset,
name = name,
value = dataset.variable(
index = index,
name = name
)
)
在 dict
次迭代中无法保证顺序。如果您希望按特定顺序排列密钥,则必须强制执行。您可以通过在对象定义中包含变量名称列表来实现。或者,如果始终按字母顺序排列就足够了,您可以
def variables(self, index = 0):
varsrc = self._data[index]
return sorted(varsrc.keys())
(请注意,我实际上使用了传递的索引,您的代码没有。)
作为一个单独的问题,如果您想从 dict
中提取键,传递 dict
比传递允许您找到 [=11] 的索引更有意义=].