Python 等同于给数值直方图
Python equivalent fo hive NumericHistogram
我有一个 python(更具体地说 -- pypy 兼容)代码。我必须在其中使用直方图,如果不完全相同,它应该非常相似 this java histogram implementation in Hive project. Numpy has a histogram implementation。
有谁知道这两者在使用上是否等效,或者可以通过选择适当的参数值来实现?我可以通过阅读代码找到它,尽管在这里检查是否有人已经知道这个。
答案很长,TLDR 以粗体显示
您可以只编译和 运行 两者,而不是尝试理论工艺和猜测代码的作用。如有疑问,只需 运行 即可。通过一些工作,您可以获得 NumericHistogram.java
链接,无需 maven 即可编译,只需使用 CLI javac
调用(只需删除对 hive
包和相关方法的引用) .
我只是在阵列上测试了两者 [0,1,...,98,99]
.
Java 版本 (java 10.0.2):
编辑:收到反馈(通过电子邮件)以包含 Java 代码。在这里(删除了文档字符串和一些注释,并且没有包括所有 public 方法):
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.List;
import java.util.Collections;
import java.util.Random;
public class NumericHistogram {
static class Coord implements Comparable {
double x;
double y;
public int compareTo(Object other) {
return Double.compare(x, ((Coord) other).x);
}
};
// Class variables
private int nbins;
private int nusedbins;
private ArrayList<Coord> bins;
private Random prng;
public NumericHistogram() {
nbins = 0;
nusedbins = 0;
bins = null;
// init the RNG for breaking ties in histogram merging. A fixed seed is specified here
// to aid testing, but can be eliminated to use a time-based seed (which would
// make the algorithm non-deterministic).
prng = new Random(31183);
}
public void reset() {
bins = null;
nbins = nusedbins = 0;
}
public int getUsedBins() {
return nusedbins;
}
public boolean isReady() {
return nbins != 0;
}
public Coord getBin(int b) {
return bins.get(b);
}
public void allocate(int num_bins) {
nbins = num_bins;
bins = new ArrayList<Coord>();
nusedbins = 0;
}
public void add(double v) {
int bin = 0;
for(int l=0, r=nusedbins; l < r; ) {
bin = (l+r)/2;
if (bins.get(bin).x > v) {
r = bin;
} else {
if (bins.get(bin).x < v) {
l = ++bin;
} else {
break; // break loop on equal comparator
}
}
}
if (bin < nusedbins && bins.get(bin).x == v) {
bins.get(bin).y++;
} else {
Coord newBin = new Coord();
newBin.x = v;
newBin.y = 1;
bins.add(bin, newBin);
// Trim the bins down to the correct number of bins.
if (++nusedbins > nbins) {
trim();
}
}
}
private void trim() {
while(nusedbins > nbins) {
// Find the closest pair of bins in terms of x coordinates. Break ties randomly.
double smallestdiff = bins.get(1).x - bins.get(0).x;
int smallestdiffloc = 0, smallestdiffcount = 1;
for(int i = 1; i < nusedbins-1; i++) {
double diff = bins.get(i+1).x - bins.get(i).x;
if(diff < smallestdiff) {
smallestdiff = diff;
smallestdiffloc = i;
smallestdiffcount = 1;
} else {
if(diff == smallestdiff && prng.nextDouble() <= (1.0/++smallestdiffcount) ) {
smallestdiffloc = i;
}
}
}
double d = bins.get(smallestdiffloc).y + bins.get(smallestdiffloc+1).y;
Coord smallestdiffbin = bins.get(smallestdiffloc);
smallestdiffbin.x *= smallestdiffbin.y / d;
smallestdiffbin.x += bins.get(smallestdiffloc+1).x / d * bins.get(smallestdiffloc+1).y;
smallestdiffbin.y = d;
// Shift the remaining bins left one position
bins.remove(smallestdiffloc+1);
nusedbins--;
}
}
public int getNumBins() {
return bins == null ? 0 : bins.size();
}
}
我将这个 main 插入 NumericHistogram
class(见上文):
public static void main(String[] args) {
NumericHistogram hist = new NumericHistogram();
hist.allocate(10);
for (int i = 0; i < 100; i++) {
hist.add(i);
}
ArrayList<Coord> bins = new ArrayList<Coord>();
for (int i = 0; i < 10; i++) {
bins.add(hist.getBin(i));
}
int index = 0;
for (Coord bin : bins) {
System.out.println(index + "th bin x: " + bin.x);
System.out.println(index + "th bin y: " + bin.y);
index++;
}
}
这个输出:
Matthews-MacBook-Pro:Whosebug matt$ java NumericHistogram
0th bin x: 2.0
0th bin y: 5.0
1th bin x: 9.5
1th bin y: 10.0
2th bin x: 21.5
2th bin y: 14.0
3th bin x: 33.0
3th bin y: 9.0
4th bin x: 44.0
4th bin y: 13.0
5th bin x: 55.0
5th bin y: 9.0
6th bin x: 64.5
6th bin y: 10.0
7th bin x: 75.5
7th bin y: 12.0
8th bin x: 88.0
8th bin y: 13.0
9th bin x: 97.0
9th bin y: 5.0
Python版本:
numpy
版本有点不同。这是脚本:
import numpy as np
hist = np.histogram(np.arange(100)) # 10 is the default for num bins
print(hist)
这个输出:
(array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]), array([ 0. , 9.9, 19.8, 29.7, 39.6, 49.5, 59.4, 69.3, 79.2, 89.1, 99. ]))
与 java 版本相比,我们确实需要压缩它们,您可以这样做:
y_values, x_values = hist
i = 0
for x_val, y_val in zip(x_values, y_values):
print(str(i) + "th bin x: " + str(x_val))
print(str(i) + "th bin y: " + str(y_val))
i += 1
它输出:
0th bin x: 0.0
0th bin y: 10
1th bin x: 9.9
1th bin y: 10
2th bin x: 19.8
2th bin y: 10
3th bin x: 29.700000000000003
3th bin y: 10
4th bin x: 39.6
4th bin y: 10
5th bin x: 49.5
5th bin y: 10
6th bin x: 59.400000000000006
6th bin y: 10
7th bin x: 69.3
7th bin y: 10
8th bin x: 79.2
8th bin y: 10
9th bin x: 89.10000000000001
9th bin y: 10
汇总并匹配:
它们等价吗?不,它们不等价。 他们使用不同的分区方案为 bin 分配值。 Java 版本使用舍入规则将项目放入 "closest" 箱中,而 numpy
正在进行范围分组(即 0-9.9、9.9-19.8、...、89.1-100) .此外,它们生成的默认 bin 也不相同,这是有道理的,因为方案如此不同。
我可以让它们匹配吗?是的,但是如果你想要一个通用的实现,你必须弄清楚如何在 Python 领域中 准确地 生成随机数 Java 。.我尝试复制实现,并让它工作,但我无法获得相同的随机数。即使您使用 numpy
而不是将逻辑复制到 Python,您 仍然 必须能够生成 Java随机数,以使它们在一般情况下匹配。这个项目似乎相关,但它只是 Java 6 RNG,而不是 Java 的更高版本:https://github.com/MostAwesomeDude/java-random。冒着让这个巨大的答案变得更长的风险,这里是复制 Java 实现的 Python 代码:
""" Just a mindless copy for easy verification. Not good style or performant. """
import random
class Coord:
def __init__(self, x, y):
self.x = float(x)
self.y = float(y)
def __repr__(self): # debug
return "Coord(" + str(self.x) + ", " + str(self.y) + ")"
def __str__(self): # debug
return "Coord(" + str(self.x) + ", " + str(self.y) + ")"
class Random:
""" This class needs fixin. You'll have to do some work here to make it match your version of Java. """
def __init__(self, seed):
random.seed(seed)
def nextDouble(self):
return random.uniform(0, 1)
class NumericHistogram:
def __init__(self):
self.nbins = 0
self.nusedbins = 0
self.bins = None
self.prng = Random(31183) # This should behave the same as Java's RNG for your Java version.
def allocate(self, num_bins):
self.nbins = num_bins
self.bins = []
self.nusedbins = 0
def add(self, v):
bin = 0
l = 0
r = self.nusedbins
while(l < r):
bin = (l+r)//2
if self.bins[bin].x > v:
r = bin
else:
if self.bins[bin].x < v:
l = bin + 1; bin += 1
else:
break
if bin < self.nusedbins and self.bins[bin].x == v:
self.bins[bin].y += 1
else:
newBin = Coord(x=v, y=1)
if bin == len(self.bins):
self.bins.append(newBin)
else:
self.bins[bin] == newBin
self.nusedbins += 1
if (self.nusedbins > self.nbins):
self.trim()
def trim(self):
while self.nusedbins > self.nbins:
smallestdiff = self.bins[1].x - self.bins[0].x
smallestdiffloc = 0
smallestdiffcount = 1
for i in range(1, self.nusedbins-1):
diff = self.bins[i+1].x - self.bins[i].x
if diff < smallestdiff:
smallestdiff = diff
smallestdiffloc = i
smallestdiffcount = 1
else:
smallestdiffcount += 1
if diff == smallestdiff and self.prng.nextDouble() <= (1.0/smallestdiffcount):
smallestdiffloc = i
d = self.bins[smallestdiffloc].y + self.bins[smallestdiffloc+1].y
smallestdiffbin = self.bins[smallestdiffloc]
smallestdiffbin.x *= smallestdiffbin.y / d
smallestdiffbin.x += self.bins[smallestdiffloc+1].x / d * self.bins[smallestdiffloc+1].y
smallestdiffbin.y = d
self.bins.pop(smallestdiffloc+1)
self.nusedbins -= 1
和 "main":
hist = NumericHistogram()
hist.allocate(10)
for i in range(100):
hist.add(i)
for ind, bin in enumerate(hist.bins):
print(str(ind) + "th bin x: " + str(bin.x))
print(str(ind) + "th bin y: " + str(bin.y))
这个输出:
0th bin x: 4.0
0th bin y: 9.0
1th bin x: 13.0
1th bin y: 9.0
2th bin x: 24.0
2th bin y: 13.0
3th bin x: 35.0
3th bin y: 9.0
4th bin x: 46.5
4th bin y: 14.0
5th bin x: 57.5
5th bin y: 8.0
6th bin x: 66.0
6th bin y: 9.0
7th bin x: 76.49999999999999
7th bin y: 12.0
8th bin x: 88.99999999999999
8th bin y: 13.0
9th bin x: 97.5
9th bin y: 4.0
所以有点接近,但没有香蕉。区别在于 RNG(据我所知)。我不太了解 Java Random
或一般的 RNG:所以最好只 post 另一个 问题在这里关于如何生成随机数
HTH,可以让您朝着正确的方向开始。
我有一个 python(更具体地说 -- pypy 兼容)代码。我必须在其中使用直方图,如果不完全相同,它应该非常相似 this java histogram implementation in Hive project. Numpy has a histogram implementation。 有谁知道这两者在使用上是否等效,或者可以通过选择适当的参数值来实现?我可以通过阅读代码找到它,尽管在这里检查是否有人已经知道这个。
答案很长,TLDR 以粗体显示
您可以只编译和 运行 两者,而不是尝试理论工艺和猜测代码的作用。如有疑问,只需 运行 即可。通过一些工作,您可以获得 NumericHistogram.java
链接,无需 maven 即可编译,只需使用 CLI javac
调用(只需删除对 hive
包和相关方法的引用) .
我只是在阵列上测试了两者 [0,1,...,98,99]
.
Java 版本 (java 10.0.2):
编辑:收到反馈(通过电子邮件)以包含 Java 代码。在这里(删除了文档字符串和一些注释,并且没有包括所有 public 方法):
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.List;
import java.util.Collections;
import java.util.Random;
public class NumericHistogram {
static class Coord implements Comparable {
double x;
double y;
public int compareTo(Object other) {
return Double.compare(x, ((Coord) other).x);
}
};
// Class variables
private int nbins;
private int nusedbins;
private ArrayList<Coord> bins;
private Random prng;
public NumericHistogram() {
nbins = 0;
nusedbins = 0;
bins = null;
// init the RNG for breaking ties in histogram merging. A fixed seed is specified here
// to aid testing, but can be eliminated to use a time-based seed (which would
// make the algorithm non-deterministic).
prng = new Random(31183);
}
public void reset() {
bins = null;
nbins = nusedbins = 0;
}
public int getUsedBins() {
return nusedbins;
}
public boolean isReady() {
return nbins != 0;
}
public Coord getBin(int b) {
return bins.get(b);
}
public void allocate(int num_bins) {
nbins = num_bins;
bins = new ArrayList<Coord>();
nusedbins = 0;
}
public void add(double v) {
int bin = 0;
for(int l=0, r=nusedbins; l < r; ) {
bin = (l+r)/2;
if (bins.get(bin).x > v) {
r = bin;
} else {
if (bins.get(bin).x < v) {
l = ++bin;
} else {
break; // break loop on equal comparator
}
}
}
if (bin < nusedbins && bins.get(bin).x == v) {
bins.get(bin).y++;
} else {
Coord newBin = new Coord();
newBin.x = v;
newBin.y = 1;
bins.add(bin, newBin);
// Trim the bins down to the correct number of bins.
if (++nusedbins > nbins) {
trim();
}
}
}
private void trim() {
while(nusedbins > nbins) {
// Find the closest pair of bins in terms of x coordinates. Break ties randomly.
double smallestdiff = bins.get(1).x - bins.get(0).x;
int smallestdiffloc = 0, smallestdiffcount = 1;
for(int i = 1; i < nusedbins-1; i++) {
double diff = bins.get(i+1).x - bins.get(i).x;
if(diff < smallestdiff) {
smallestdiff = diff;
smallestdiffloc = i;
smallestdiffcount = 1;
} else {
if(diff == smallestdiff && prng.nextDouble() <= (1.0/++smallestdiffcount) ) {
smallestdiffloc = i;
}
}
}
double d = bins.get(smallestdiffloc).y + bins.get(smallestdiffloc+1).y;
Coord smallestdiffbin = bins.get(smallestdiffloc);
smallestdiffbin.x *= smallestdiffbin.y / d;
smallestdiffbin.x += bins.get(smallestdiffloc+1).x / d * bins.get(smallestdiffloc+1).y;
smallestdiffbin.y = d;
// Shift the remaining bins left one position
bins.remove(smallestdiffloc+1);
nusedbins--;
}
}
public int getNumBins() {
return bins == null ? 0 : bins.size();
}
}
我将这个 main 插入 NumericHistogram
class(见上文):
public static void main(String[] args) {
NumericHistogram hist = new NumericHistogram();
hist.allocate(10);
for (int i = 0; i < 100; i++) {
hist.add(i);
}
ArrayList<Coord> bins = new ArrayList<Coord>();
for (int i = 0; i < 10; i++) {
bins.add(hist.getBin(i));
}
int index = 0;
for (Coord bin : bins) {
System.out.println(index + "th bin x: " + bin.x);
System.out.println(index + "th bin y: " + bin.y);
index++;
}
}
这个输出:
Matthews-MacBook-Pro:Whosebug matt$ java NumericHistogram
0th bin x: 2.0
0th bin y: 5.0
1th bin x: 9.5
1th bin y: 10.0
2th bin x: 21.5
2th bin y: 14.0
3th bin x: 33.0
3th bin y: 9.0
4th bin x: 44.0
4th bin y: 13.0
5th bin x: 55.0
5th bin y: 9.0
6th bin x: 64.5
6th bin y: 10.0
7th bin x: 75.5
7th bin y: 12.0
8th bin x: 88.0
8th bin y: 13.0
9th bin x: 97.0
9th bin y: 5.0
Python版本:
numpy
版本有点不同。这是脚本:
import numpy as np
hist = np.histogram(np.arange(100)) # 10 is the default for num bins
print(hist)
这个输出:
(array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]), array([ 0. , 9.9, 19.8, 29.7, 39.6, 49.5, 59.4, 69.3, 79.2, 89.1, 99. ]))
与 java 版本相比,我们确实需要压缩它们,您可以这样做:
y_values, x_values = hist
i = 0
for x_val, y_val in zip(x_values, y_values):
print(str(i) + "th bin x: " + str(x_val))
print(str(i) + "th bin y: " + str(y_val))
i += 1
它输出:
0th bin x: 0.0
0th bin y: 10
1th bin x: 9.9
1th bin y: 10
2th bin x: 19.8
2th bin y: 10
3th bin x: 29.700000000000003
3th bin y: 10
4th bin x: 39.6
4th bin y: 10
5th bin x: 49.5
5th bin y: 10
6th bin x: 59.400000000000006
6th bin y: 10
7th bin x: 69.3
7th bin y: 10
8th bin x: 79.2
8th bin y: 10
9th bin x: 89.10000000000001
9th bin y: 10
汇总并匹配:
它们等价吗?不,它们不等价。 他们使用不同的分区方案为 bin 分配值。 Java 版本使用舍入规则将项目放入 "closest" 箱中,而 numpy
正在进行范围分组(即 0-9.9、9.9-19.8、...、89.1-100) .此外,它们生成的默认 bin 也不相同,这是有道理的,因为方案如此不同。
我可以让它们匹配吗?是的,但是如果你想要一个通用的实现,你必须弄清楚如何在 Python 领域中 准确地 生成随机数 Java 。.我尝试复制实现,并让它工作,但我无法获得相同的随机数。即使您使用 numpy
而不是将逻辑复制到 Python,您 仍然 必须能够生成 Java随机数,以使它们在一般情况下匹配。这个项目似乎相关,但它只是 Java 6 RNG,而不是 Java 的更高版本:https://github.com/MostAwesomeDude/java-random。冒着让这个巨大的答案变得更长的风险,这里是复制 Java 实现的 Python 代码:
""" Just a mindless copy for easy verification. Not good style or performant. """
import random
class Coord:
def __init__(self, x, y):
self.x = float(x)
self.y = float(y)
def __repr__(self): # debug
return "Coord(" + str(self.x) + ", " + str(self.y) + ")"
def __str__(self): # debug
return "Coord(" + str(self.x) + ", " + str(self.y) + ")"
class Random:
""" This class needs fixin. You'll have to do some work here to make it match your version of Java. """
def __init__(self, seed):
random.seed(seed)
def nextDouble(self):
return random.uniform(0, 1)
class NumericHistogram:
def __init__(self):
self.nbins = 0
self.nusedbins = 0
self.bins = None
self.prng = Random(31183) # This should behave the same as Java's RNG for your Java version.
def allocate(self, num_bins):
self.nbins = num_bins
self.bins = []
self.nusedbins = 0
def add(self, v):
bin = 0
l = 0
r = self.nusedbins
while(l < r):
bin = (l+r)//2
if self.bins[bin].x > v:
r = bin
else:
if self.bins[bin].x < v:
l = bin + 1; bin += 1
else:
break
if bin < self.nusedbins and self.bins[bin].x == v:
self.bins[bin].y += 1
else:
newBin = Coord(x=v, y=1)
if bin == len(self.bins):
self.bins.append(newBin)
else:
self.bins[bin] == newBin
self.nusedbins += 1
if (self.nusedbins > self.nbins):
self.trim()
def trim(self):
while self.nusedbins > self.nbins:
smallestdiff = self.bins[1].x - self.bins[0].x
smallestdiffloc = 0
smallestdiffcount = 1
for i in range(1, self.nusedbins-1):
diff = self.bins[i+1].x - self.bins[i].x
if diff < smallestdiff:
smallestdiff = diff
smallestdiffloc = i
smallestdiffcount = 1
else:
smallestdiffcount += 1
if diff == smallestdiff and self.prng.nextDouble() <= (1.0/smallestdiffcount):
smallestdiffloc = i
d = self.bins[smallestdiffloc].y + self.bins[smallestdiffloc+1].y
smallestdiffbin = self.bins[smallestdiffloc]
smallestdiffbin.x *= smallestdiffbin.y / d
smallestdiffbin.x += self.bins[smallestdiffloc+1].x / d * self.bins[smallestdiffloc+1].y
smallestdiffbin.y = d
self.bins.pop(smallestdiffloc+1)
self.nusedbins -= 1
和 "main":
hist = NumericHistogram()
hist.allocate(10)
for i in range(100):
hist.add(i)
for ind, bin in enumerate(hist.bins):
print(str(ind) + "th bin x: " + str(bin.x))
print(str(ind) + "th bin y: " + str(bin.y))
这个输出:
0th bin x: 4.0
0th bin y: 9.0
1th bin x: 13.0
1th bin y: 9.0
2th bin x: 24.0
2th bin y: 13.0
3th bin x: 35.0
3th bin y: 9.0
4th bin x: 46.5
4th bin y: 14.0
5th bin x: 57.5
5th bin y: 8.0
6th bin x: 66.0
6th bin y: 9.0
7th bin x: 76.49999999999999
7th bin y: 12.0
8th bin x: 88.99999999999999
8th bin y: 13.0
9th bin x: 97.5
9th bin y: 4.0
所以有点接近,但没有香蕉。区别在于 RNG(据我所知)。我不太了解 Java HTH,可以让您朝着正确的方向开始。Random
或一般的 RNG:所以最好只 post 另一个 问题在这里关于如何生成随机数