我需要帮助从我自己的图像集创建 mnist.pkl.gz
I need help to create mnist.pkl.gz from my own set of images
我是 python 和机器学习的新手。
我成功地测试了来自深度学习的 DBN.py 个例子。现在我想把我自己的一组图片变成 mnist.pkl.gz 格式
我已经尝试了一些来自名为 JPG-PNG-to-MNIST-NN-Format 的项目的代码
在 github 但它给了我 idx 格式
我使用了一些代码将此 idx 格式转换为 mnist.pkl 但我发现应该有一个 validation_set 图像,它没有以 JPG-PNG-to-MNIST-NN-Format 和我的 DBN.py 代码给我错误 "ran out of input"
我什至试过这个
How to put my dataset in a .pkl file in the exact format and data structure used in "mnist.pkl.gz"?
但我不知道如何准备 *.csv 标签。这是我的代码
from PIL import Image
from numpy import genfromtxt
import gzip, cPickle
from glob import glob
import numpy as np
import pandas as pd
def dir_to_dataset(glob_files, loc_train_labels=""):
print("Gonna process:\n\t %s"%glob_files)
dataset = []
for file_count, file_name in enumerate( sorted(glob(glob_files),key=len) ):
image = Image.open(file_name)
img = Image.open(file_name).convert('LA') #tograyscale
pixels = [f[0] for f in list(img.getdata())]
dataset.append(pixels)
if file_count % 1000 == 0:
print("\t %s files processed"%file_count)
# outfile = glob_files+"out"
# np.save(outfile, dataset)
if len(loc_train_labels) > 0:
df = pd.read_csv(loc_train_labels)
return np.array(dataset), np.array(df["class"])
else:
return np.array(dataset)
Data1, y1 = dir_to_dataset("train\*.png","train.csv")
Data2, y2 = dir_to_dataset("valid\*.png","valid.csv")
Data3, y3 = dir_to_dataset("test\*.png","test.csv")
# Data and labels are read
train_set_x = Data1[:7717]
train_set_y = y1[:7717]
val_set_x = Data2[:1653]
val_set_y = y2[:1653]
test_set_x = Data3[:1654]
test_set_y = y3[:1654]
# Divided dataset into 3 parts. I had 6281 images.
train_set = train_set_x, train_set_y
val_set = val_set_x, val_set_y
test_set = test_set_x, val_set_y
dataset = [train_set, val_set, test_set]
f = gzip.open('mnist.pkl.gz','wb')
cPickle.dump(dataset, f, protocol=2)
f.close()
但我收到这些错误
Gonna process:
train\*.png
Traceback (most recent call last):
File "to-mnist.py", line 27, in <module>
Data1, y1 = dir_to_dataset("train\*.png","train.csv")
File "to-mnist.py", line 22, in dir_to_dataset
return np.array(dataset), np.array(df["class"])
File "/home/alireza/.local/lib/python2.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "/home/alireza/.local/lib/python2.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'class'
我认为这与我的 *.csv 文件有关。 *.csv 文件是普通的 txt 文档,其中 class 为 0 和 1。像这样
0
0
0
0
0
0
1
1
1
1
您应该将列名添加到您的数据框中。
将 "to-mnist.py" 第 21 行更改为这样。
df = pd.read_csv(loc_train_labels, names = ["class"])
非常感谢您回答我的问题。
我在 GitHub 上做了一个项目,并将我所有的数据都放在里面,为像我这样刚开始深度学习的人创建了 mnist.pkl.gz 数据集。
你可以在这里找到
https://github.com/tikroute/mnist.pkl.gz-dataset-creator
希望对这个领域的其他同学有所帮助:)
我是 python 和机器学习的新手。 我成功地测试了来自深度学习的 DBN.py 个例子。现在我想把我自己的一组图片变成 mnist.pkl.gz 格式
我已经尝试了一些来自名为 JPG-PNG-to-MNIST-NN-Format 的项目的代码 在 github 但它给了我 idx 格式 我使用了一些代码将此 idx 格式转换为 mnist.pkl 但我发现应该有一个 validation_set 图像,它没有以 JPG-PNG-to-MNIST-NN-Format 和我的 DBN.py 代码给我错误 "ran out of input" 我什至试过这个 How to put my dataset in a .pkl file in the exact format and data structure used in "mnist.pkl.gz"? 但我不知道如何准备 *.csv 标签。这是我的代码
from PIL import Image
from numpy import genfromtxt
import gzip, cPickle
from glob import glob
import numpy as np
import pandas as pd
def dir_to_dataset(glob_files, loc_train_labels=""):
print("Gonna process:\n\t %s"%glob_files)
dataset = []
for file_count, file_name in enumerate( sorted(glob(glob_files),key=len) ):
image = Image.open(file_name)
img = Image.open(file_name).convert('LA') #tograyscale
pixels = [f[0] for f in list(img.getdata())]
dataset.append(pixels)
if file_count % 1000 == 0:
print("\t %s files processed"%file_count)
# outfile = glob_files+"out"
# np.save(outfile, dataset)
if len(loc_train_labels) > 0:
df = pd.read_csv(loc_train_labels)
return np.array(dataset), np.array(df["class"])
else:
return np.array(dataset)
Data1, y1 = dir_to_dataset("train\*.png","train.csv")
Data2, y2 = dir_to_dataset("valid\*.png","valid.csv")
Data3, y3 = dir_to_dataset("test\*.png","test.csv")
# Data and labels are read
train_set_x = Data1[:7717]
train_set_y = y1[:7717]
val_set_x = Data2[:1653]
val_set_y = y2[:1653]
test_set_x = Data3[:1654]
test_set_y = y3[:1654]
# Divided dataset into 3 parts. I had 6281 images.
train_set = train_set_x, train_set_y
val_set = val_set_x, val_set_y
test_set = test_set_x, val_set_y
dataset = [train_set, val_set, test_set]
f = gzip.open('mnist.pkl.gz','wb')
cPickle.dump(dataset, f, protocol=2)
f.close()
但我收到这些错误
Gonna process:
train\*.png
Traceback (most recent call last):
File "to-mnist.py", line 27, in <module>
Data1, y1 = dir_to_dataset("train\*.png","train.csv")
File "to-mnist.py", line 22, in dir_to_dataset
return np.array(dataset), np.array(df["class"])
File "/home/alireza/.local/lib/python2.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "/home/alireza/.local/lib/python2.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'class'
我认为这与我的 *.csv 文件有关。 *.csv 文件是普通的 txt 文档,其中 class 为 0 和 1。像这样
0
0
0
0
0
0
1
1
1
1
您应该将列名添加到您的数据框中。 将 "to-mnist.py" 第 21 行更改为这样。
df = pd.read_csv(loc_train_labels, names = ["class"])
非常感谢您回答我的问题。 我在 GitHub 上做了一个项目,并将我所有的数据都放在里面,为像我这样刚开始深度学习的人创建了 mnist.pkl.gz 数据集。
你可以在这里找到 https://github.com/tikroute/mnist.pkl.gz-dataset-creator
希望对这个领域的其他同学有所帮助:)