SynthText in the Wild 数据集中一共包含多少种字符？

Question

我从官方网站下载了 SynthText in the Wild 数据集。

然后，我看了官方的readme.txt，但是我找不到数据集有多少个字符。我用谷歌搜索但没找到...

正如您在下面的示例图片中看到的，标注文本中存在一些符号，例如 .、: 和 -。所以，这个数据集的字符集包含字母（26）+ 数字（10）+ 若干符号（？）。

有人知道吗？

Answer 1

我实现了自己的代码来计算符号。

def get_characters(basedir, imagedirname='SynthText', skip_missing=False):
    """Collect and print the non-alphanumeric characters found in SynthText's gt.mat.

    Walks every image entry via ``_gtmatRecognizer``, strips ``[A-Za-z0-9]``
    from each annotated word and accumulates whatever is left. A progress
    line is rewritten on stdout for every image, and the final symbol set
    and its size are printed at the end.

    :param basedir: str, directory that contains the image directory
    :param imagedirname: (Optional) str, name of the directory holding the
        images and 'gt.mat'
    :param skip_missing: (Optional) bool, when True an image file that does
        not exist is logged and skipped; when False FileNotFoundError is raised
    :return: None
    """

    class Symbols:
        """Accumulates the distinct symbol characters seen so far."""

        def __init__(self):
            self.symbols = set()

        def update(self, data):
            # In-place update; iterating a str adds its individual characters.
            self.symbols.update(data)

        def __len__(self):
            return len(self.symbols)

        def __str__(self):
            # Sorted so the progress line and final report do not depend on
            # set iteration order.
            return ''.join(sorted(self.symbols))

    symbols = Symbols()
    alnum_run = re.compile(r'[A-Za-z0-9]+')

    def csvgenerator(annodir, imagedir, cbb, wBB, imname, txts, symbols, **kwargs):
        image_num = kwargs.get('image_num')
        i = kwargs.get('i')

        imgpath = os.path.join(imagedir, imname)

        # Check for the file BEFORE anything touches it. The image itself is
        # not needed to count characters, so it is no longer decoded (the old
        # cv2.imread result was only used for an unused height/width and
        # crashed with AttributeError on a missing file).
        if not os.path.exists(imgpath):
            if not skip_missing:
                raise FileNotFoundError('{} was not found'.format(imgpath))
            logging.warning('Missing image: {}'.format(imgpath))
            raise _Skip()

        # convert txts to list of str
        # I don't know why txts is
        # ['Lines:\nI lost\nKevin ', 'will                ', 'line\nand            ',
        # 'and\nthe             ', '(and                ', 'the\nout             ',
        # 'you                 ', "don't\n pkg          "]
        # there is strange blank and the length of txts is different from the one of wBB
        txts = ' '.join(txts.tolist()).split()
        text_num = len(txts)

        if wBB.ndim == 2:
            # convert shape=(2, 4,) to (2, 4, 1)
            wBB = np.expand_dims(wBB, 2)
        if cbb.ndim == 2:
            # same promotion for a single-character entry, so shape[2] below exists
            cbb = np.expand_dims(cbb, 2)

        assert text_num == wBB.shape[2], 'The length of text and wordBB must be same, but got {} and {}'.format(
            text_num, wBB.shape[2])

        # Every character (alphanumeric or not) must have one character box.
        char_num = sum(len(text) for text in txts)
        assert char_num == cbb.shape[
            2], 'The length of characters and cbb must be same, but got {} and {}'.format(
            char_num, cbb.shape[2])

        for text in txts:
            # whatever remains after removing alphanumerics is a symbol
            symbols.update(alnum_run.sub('', text))

        sys.stdout.write('\r{}, and number is {}...{:0.1f}% ({}/{})'.format(symbols, len(symbols), 100 * (float(i + 1) / image_num), i + 1, image_num))
        sys.stdout.flush()

    _gtmatRecognizer(csvgenerator, basedir, imagedirname, customLog=True, symbols=symbols)

    print()
    print('symbols are {}, and number is {}'.format(symbols, len(symbols)))


def _gtmatRecognizer(generator, basedir, imagedirname='SynthText', customLog=False, **kwargs):
    """
        Load SynthText's gt.mat and invoke ``generator`` once per image entry.

        Originally written to convert gt.mat to
        https://github.com/MhLiao/TextBoxes_plusplus/blob/master/data/example.xml

        <annotation>
            <folder>train_images</folder>
            <filename>img_10.jpg</filename>
            <size>
                <width>1280</width>
                <height>720</height>
                <depth>3</depth>
            </size>
            <object>
                <difficult>1</difficult>
                <content>###</content>
                <name>text</name>
                <bndbox>
                    <x1>1011</x1>
                    <y1>157</y1>
                    <x2>1079</x2>
                    <y2>160</y2>
                    <x3>1076</x3>
                    <y3>173</y3>
                    <x4>1011</x4>
                    <y4>170</y4>
                    <xmin>1011</xmin>
                    <ymin>157</ymin>
                    <xmax>1079</xmax>
                    <ymax>173</ymax>
                </bndbox>
            </object>
            .
            .
            .

        </annotation>

        The 'Annotations' directory under ``basedir`` is created if needed.
        For each image, ``generator`` is called as
        ``generator(annodir, imagedir, cbb, wBB, imname, txts, i=i, image_num=image_num, **kwargs)``;
        if it raises ``_Skip`` that image is silently skipped.

        :param generator: callable, per-image callback (see call form above)
        :param basedir: str, directory path containing 'SynthText' (and 'licence.txt')
        :param imagedirname: (Optional) str, image directory name including 'gt.mat'
        :param customLog: (Optional) bool, if True the built-in progress line is
            not written (the generator is expected to report progress itself)
        :param kwargs: extra keyword arguments forwarded unchanged to ``generator``
        :return: None
        :raises FileNotFoundError: if gt.mat does not exist in the image directory
        """
    logging.basicConfig(level=logging.INFO)

    imagedir = os.path.join(basedir, imagedirname)
    gtpath = os.path.join(imagedir, 'gt.mat')

    annodir = os.path.join(basedir, 'Annotations')

    if not os.path.exists(gtpath):
        raise FileNotFoundError('{} was not found'.format(gtpath))

    # create Annotations directory; exist_ok avoids the check-then-create race
    os.makedirs(annodir, exist_ok=True)

    # ref: http://www.robots.ox.ac.uk/~vgg/data/scenetext/readme.txt
    # gts = dict;
    #     __header__: bytes
    #     __version__: str
    #     __globals__: list
    #     charBB: object ndarray, shape = (1, image num).
    #             Character level bounding box.
    #             shape = (2=(x,y), 4=(top left,...: clockwise), BBox char num)
    #     wordBB: object ndarray, shape = (1, image num).
    #             Word level bounding box.
    #             shape = (2=(x,y), 4=(top left,...: clockwise), BBox word num)
    #     imnames: object ndarray, shape = (1, image num); each entry wraps the file name
    #     txt: object ndarray, shape = (1, image num).
    #          Text. shape = (word num)
    # NOTE(review): shapes above follow the readme and the indexing below — confirm
    # against the actual file.
    logging.info('Loading {} now.\nIt may take a while.'.format(gtpath))
    gts = sio.loadmat(gtpath)
    logging.info('Loaded\n')

    # every field carries a leading singleton axis; [0] drops it
    charBB = gts['charBB'][0]
    wordBB = gts['wordBB'][0]
    imnames = gts['imnames'][0]
    texts = gts['txt'][0]

    image_num = imnames.size

    for i, (cbb, wBB, imname, txts) in enumerate(zip(charBB, wordBB, imnames, texts)):
        # unwrap the one-element array holding the file name
        imname = imname[0]

        try:
            generator(annodir, imagedir, cbb, wBB, imname, txts, i=i, image_num=image_num, **kwargs)
        except _Skip:
            # the generator asked to skip this image; carry on with the next one
            pass

        if not customLog:
            sys.stdout.write('\rGenerating... {:0.1f}% ({}/{})'.format(100 * (float(i + 1) / image_num), i + 1, image_num))
        sys.stdout.flush()

    print()
    logging.info('Finished!!!')

最终，我得到了符号的数量：共 32 种。看起来这些符号正好是除空格（space）以外的全部 ASCII 可打印标点字符。

INFO:root:Loading ~/data/text/SynthText/SynthText/gt.mat now.
It may take a while.
INFO:root:Loaded

}&|%_(],$^{+?#@/-`).<=;~['>:\!"*, and number is 32...100.0% (858750/858750)
INFO:root:Finished!!!

symbols are }&|%_(],$^{+?#@/-`).<=;~['>:\!"*, and number is 32

Wild Dataset 中的 SynthText 有多少字符数？

How many character number does SynthText in the Wild Dataset have?

dataset

text-recognition