mean() 得到了一个意外的关键字参数 'dtype'!
mean() got an unexpected keyword argument 'dtype'!
我正在尝试使用 Intel Bigdl 实现图像分类。它使用 mnist 数据集进行分类。因为,我不想使用 mnist 数据集,所以我编写了如下替代方法:
图片Utils.py
from StringIO import StringIO
from PIL import Image
import numpy as np
from bigdl.util import common
from bigdl.dataset import mnist
from pyspark.mllib.stat import Statistics
def label_img(img):
word_label = img.split('.')[-2].split('/')[-1]
print word_label
# conversion to one-hot array [cat,dog]
# [much cat, no dog]
if "jobs" in word_label: return [1,0]
# [no cat, very doggo]
elif "zuckerberg" in word_label: return [0,1]
# target is start from 0,
def get_data(sc,path):
img_dir = path
train = sc.binaryFiles(img_dir + "/train")
test = sc.binaryFiles(img_dir+"/test")
image_to_array = lambda rawdata: np.asarray(Image.open(StringIO(rawdata)))
train_data = train.map(lambda x : (image_to_array(x[1]),np.array(label_img(x[0]))))
test_data = test.map(lambda x : (image_to_array(x[1]),np.array(label_img(x[0]))))
train_images = train_data.map(lambda x : x[0])
test_images = test_data.map((lambda x : x[0]))
train_labels = train_data.map(lambda x : x[1])
test_labels = test_data.map(lambda x : x[1])
training_mean = np.mean(train_images)
training_std = np.std(train_images)
rdd_train_images = sc.parallelize(train_images)
rdd_train_labels = sc.parallelize(train_labels)
rdd_test_images = sc.parallelize(test_images)
rdd_test_labels = sc.parallelize(test_labels)
rdd_train_sample = rdd_train_images.zip(rdd_train_labels).map(lambda (features, label):
common.Sample.from_ndarray(
(features - training_mean) / training_std,
label + 1))
rdd_test_sample = rdd_test_images.zip(rdd_test_labels).map(lambda (features, label):
common.Sample.from_ndarray(
(features - training_mean) / training_std,
label + 1))
return (rdd_train_sample, rdd_test_sample)
现在,当我尝试使用真实图像获取数据时,如下所示:
Classification.py
import pandas
import datetime as dt
from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.dataset.transformer import *
from bigdl.dataset import mnist
from imageUtils import get_data
from StringIO import StringIO
from PIL import Image
import numpy as np
init_engine()
path = "/home/fusemachine/Hyper/person"
(train_data, test_data) = get_data(sc,path)
print train_data.count()
print test_data.count()
我收到以下错误
TypeError Traceback (most recent call >last)
in ()
2 # Get and store MNIST into RDD of Sample, please edit the "mnist_path" accordingly.
3 path = "/home/fusemachine/Hyper/person"
----> 4 (train_data, test_data) = get_data(sc,path)
5 print train_data.count()
6 print test_data.count()
/home/fusemachine/Downloads/dist-spark-2.1.0-scala-2.11.8-linux64-0.1.1-dist/imageUtils.py in get_data(sc, path)
31 test_labels = test_data.map(lambda x : x[1])
---> 33 training_mean = np.mean(train_images)
34 training_std = np.std(train_images)
35 rdd_train_images = sc.parallelize(train_images)
/opt/anaconda3/lib/python2.7/site-packages/numpy/core/fromnumeric.pyc 均值(a、轴、dtype、out、keepdims)
2884 pass
2885 else:
-> 2886 return mean(axis=axis, dtype=dtype, out=out, **kwargs)
2887
2888 return _methods._mean(a, axis=axis, dtype=dtype,
TypeError: mean() 得到了一个意外的关键字参数 'dtype'
我想不出解决办法。 mnist 数据集还有其他替代方案吗?这样我们就可以直接处理真实的Image了?
谢谢
train_images 是一个 rdd,你不能在 rdd 上应用 numpy mean。一种方法是执行 collect() 并在上面应用 numpy 意思,
train_images = train_data.map(lambda x : x[0]).collect()
training_mean = np.mean(train_images)
或rdd.mean()
training_mean = train_images.mean()
我正在尝试使用 Intel Bigdl 实现图像分类。它使用 mnist 数据集进行分类。因为,我不想使用 mnist 数据集,所以我编写了如下替代方法:
图片Utils.py
from StringIO import StringIO
from PIL import Image
import numpy as np
from bigdl.util import common
from bigdl.dataset import mnist
from pyspark.mllib.stat import Statistics
def label_img(img):
word_label = img.split('.')[-2].split('/')[-1]
print word_label
# conversion to one-hot array [cat,dog]
# [much cat, no dog]
if "jobs" in word_label: return [1,0]
# [no cat, very doggo]
elif "zuckerberg" in word_label: return [0,1]
# target is start from 0,
def get_data(sc,path):
img_dir = path
train = sc.binaryFiles(img_dir + "/train")
test = sc.binaryFiles(img_dir+"/test")
image_to_array = lambda rawdata: np.asarray(Image.open(StringIO(rawdata)))
train_data = train.map(lambda x : (image_to_array(x[1]),np.array(label_img(x[0]))))
test_data = test.map(lambda x : (image_to_array(x[1]),np.array(label_img(x[0]))))
train_images = train_data.map(lambda x : x[0])
test_images = test_data.map((lambda x : x[0]))
train_labels = train_data.map(lambda x : x[1])
test_labels = test_data.map(lambda x : x[1])
training_mean = np.mean(train_images)
training_std = np.std(train_images)
rdd_train_images = sc.parallelize(train_images)
rdd_train_labels = sc.parallelize(train_labels)
rdd_test_images = sc.parallelize(test_images)
rdd_test_labels = sc.parallelize(test_labels)
rdd_train_sample = rdd_train_images.zip(rdd_train_labels).map(lambda (features, label):
common.Sample.from_ndarray(
(features - training_mean) / training_std,
label + 1))
rdd_test_sample = rdd_test_images.zip(rdd_test_labels).map(lambda (features, label):
common.Sample.from_ndarray(
(features - training_mean) / training_std,
label + 1))
return (rdd_train_sample, rdd_test_sample)
现在,当我尝试使用真实图像获取数据时,如下所示:
Classification.py
import pandas
import datetime as dt
from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.dataset.transformer import *
from bigdl.dataset import mnist
from imageUtils import get_data
from StringIO import StringIO
from PIL import Image
import numpy as np
init_engine()
path = "/home/fusemachine/Hyper/person"
(train_data, test_data) = get_data(sc,path)
print train_data.count()
print test_data.count()
我收到以下错误
TypeError Traceback (most recent call >last) in ()
2 # Get and store MNIST into RDD of Sample, please edit the "mnist_path" accordingly.
3 path = "/home/fusemachine/Hyper/person"
----> 4 (train_data, test_data) = get_data(sc,path)
5 print train_data.count()
6 print test_data.count()
/home/fusemachine/Downloads/dist-spark-2.1.0-scala-2.11.8-linux64-0.1.1-dist/imageUtils.py in get_data(sc, path)
31 test_labels = test_data.map(lambda x : x[1])
---> 33 training_mean = np.mean(train_images)
34 training_std = np.std(train_images)
35 rdd_train_images = sc.parallelize(train_images)
/opt/anaconda3/lib/python2.7/site-packages/numpy/core/fromnumeric.pyc 均值(a、轴、dtype、out、keepdims)
2884 pass
2885 else:
-> 2886 return mean(axis=axis, dtype=dtype, out=out, **kwargs)
2887
2888 return _methods._mean(a, axis=axis, dtype=dtype,
TypeError: mean() 得到了一个意外的关键字参数 'dtype'
我想不出解决办法。 mnist 数据集还有其他替代方案吗?这样我们就可以直接处理真实的Image了? 谢谢
train_images 是一个 rdd,你不能在 rdd 上应用 numpy mean。一种方法是执行 collect() 并在上面应用 numpy 意思,
train_images = train_data.map(lambda x : x[0]).collect()
training_mean = np.mean(train_images)
或rdd.mean()
training_mean = train_images.mean()