计算字母的转移矩阵

Calculate transition matrix of letters

我该如何创建字母的转移矩阵?

我有一个这样的字母列表:

[u'T', u'i', u'r', u's', u'd', u'a', u'g', u' ', u's', u'k', u'a', u'l', u' ', u'd', u'u', u' ', u'i', u'n', u's', u't', u'a', u'l', u'l', u'e', u'r', u'e', u' ', u'e', u'n', u' ', u'P', u'y', u't', u'h', u'o', u'n', u' ', u'f', u'o', u'r', u't', u'o', u'l', u'k', u'e', u'r', u',', u' ', u'o', u'g', u' ', u'l',u'P', u'l', u'a', u'n', u' ', u'f', u'o', u'r', u' ', u'u', u'g', u'e', u'n', u'D', u'e', u'n', u'n', u'e', u' ', u'u', u'g', u'e', u' ', u'd', u'r', u'e', u'j', u'e', u'r', u' ', u's', u'i', u'g', u' ', u'o', u'm', u' ', u'a', u't', u' ', u'k', u'o', u'm', u'm', u'e', u' ', u'i', u'g', u'a', u'n', u'g', u' ', u'm', u'e', u'd', u' ', u'P', u'y', u't', u'h', u'o', u'n', u'.', u' ', u' ', u'T', u'i', u'r', u's', u'd', u'a', u'g', u' ', u's', u'k', u'a', u'l', u' ', u'd', u'u', u' ', u'i', u'n', u's', u't', u'a', u'l', u'l', u'e', u'r', u'e', u' ', u'e', u'n', u' ', u'P', u'y', u't', u'h', u'o', u'n', u' ', u'f', u'o', u'r', u't', u'o', u'l', u'k', u'e', u'r', u',', u' ', u'o', u'g', u' ', u'l', u'b', u'r', u'e', u' ', u'd', u'e', u'n', u'n', u'e', u' ', u'a', u't', u' ', u'k', u'e', u'n', u'd', u'e', u' ', u'v', u'e', u'd', u' ', u'a', u't', u' ', u'k', u'b', u'r', u'e', u' ', u'n', u'o', u'g', u'l', u'e', u' ', u'p', u'r', u'o', u'g', u'r', u'a', u'm', u'm', u'e', u'r', u'.', u' ', u' ', u'I', u'P', u'y', u't', u'h', u'o', u'n', u' ', u'k', u'a', u'n', u' ', u'a', u'n', u'b', u'e', u'f', u'a', u'l', u'e', u's', u' ', u'd', u'a', u' ', u'd', u'e', u'n', u'n', u'e', u' ', u'f', u'i', u'n', u'd', u'e', u's', u' ', u't', u'i', u'l', u' ', u'L', u'i', u'n']

我如何根据这个字母列表创建一个转移矩阵? 我找到了来自 “Python transition matrix” 这个问题的以下代码:

 def tmatrix(self, lst):
        """Build a matrix counting consecutive pairs of elements in ``lst``.

        The result is a ``len(lst)`` x ``len(lst)`` list of lists; cell
        ``[x-1][y-1]`` holds how many times ``y`` immediately follows ``x``.
        The elements themselves are used in index arithmetic, so they must
        be integers; non-numeric elements (e.g. strings) fail at ``x-1``
        with ``TypeError``.  Written for Python 2 (``xrange``/``iteritems``).
        """
        # Square matrix of zeros sized by the length of the input list.
        b = [[0 for _ in xrange(len(lst))] for _ in xrange(len(lst))]
        # Count every (current, next) pair of adjacent elements.
        for (x,y), c in Counter(zip(lst, lst[1:])).iteritems():
            # Element values are shifted down by one to form the indices.
            b[x-1][y-1] = c
        return b

但是我收到了以下错误,因为我的列表元素是 unicode 对象而不是 int:TypeError: unsupported operand type(s) for -: 'unicode' and 'int'。我应该如何修改代码以支持 unicode 对象?

您链接的代码是对整数序列中的相邻元素对进行计数。整数可以很容易地转换为转移矩阵中的索引(1 转换为索引 0,依此类推)。

您链接的算法也只针对唯一元素:那里构建的矩阵是 3 x 3,而不是 10 x 10。

您必须对输入列表执行相同的操作:

from collections import Counter, defaultdict
from itertools import count

def tmatrix(self, lst):
    """Count transitions between consecutive elements of ``lst``.

    Every distinct element is given an integer index the first time it is
    looked up while the matrix is being filled.  Cell ``[i][j]`` of the
    result holds how many times the element with index ``j`` immediately
    follows the element with index ``i``.

    Args:
        self: Unused; kept so the function can also be bound as a method.
        lst: Sliceable sequence of hashable elements, e.g. a list of
            characters.

    Returns:
        A square list of lists of ints with one row and one column per
        distinct element of ``lst``.
    """
    # defaultdict that'll produce a unique index for each unique element
    # encountered in lst.  The builtin next() replaces the Python-2-only
    # ``count().next`` bound method so this runs on Python 2 and 3 alike.
    counter = count()
    indices = defaultdict(lambda: next(counter))
    unique_count = len(set(lst))
    b = [[0] * unique_count for _ in range(unique_count)]
    # zip(lst, lst[1:]) pairs each element with its successor.
    for (x, y), c in Counter(zip(lst, lst[1:])).items():
        b[indices[x]][indices[y]] = c
    return b

此处 indices 字典将每个字符映射到矩阵中的行/列索引;itertools.count() 实例为字典中尚不存在的字符提供一个自动递增的整数值。

这会为您的输入样本生成一个 29 x 29 矩阵:

>>> tmatrix(None, sample)
[[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 2, 0, 0, 0, 2, 0, 0, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 2, 5, 0, 0, 0, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 3, 0, 2, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 2, 1, 0, 5, 0, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 1, 0, 6, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 3, 0, 3, 6, 4, 0, 2, 4, 2, 2, 1, 1, 2, 3, 0, 0, 3, 4, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1],
 [0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 0, 0, 0, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 2, 2, 0, 1, 7, 0, 0, 0, 3, 0, 3, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 6, 2, 2, 0, 0, 11, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 3, 0, 0, 0, 4, 0, 0, 2, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

您可能还想同时返回 indices 映射,这样您就知道哪个字符对应该矩阵中的哪个索引。

您可以把字符串(看起来原文是丹麦语)中的相邻字符两两配对,然后使用 Counter 作为稀疏矩阵,以 (from, to) 作为键:

from collections import Counter
from itertools import tee, izip

# Sample text whose consecutive characters are paired up below.
data = 'Tirsdag skal du installere en Python fortolker, og lPlan for ugenDenne uge drejer sig om at komme igang med Python.  Tirsdag skal du installere en Python fortolker, og lbre denne at kende ved at kbre nogle programmer.  IPython kan anbefales da denne findes til Lin'
# Two independent iterators over the same characters.
fst, snd = tee(data)
# Advance the second iterator once so it runs one character ahead of the first.
next(snd, '')
# Count every (current, next) character pair; the Counter is keyed by those tuples.
matrix = Counter(izip(fst, snd))

然后要获得 a->b 的转移次数,请使用 matrix['a', 'b'],依此类推;对于不存在的键,会自动返回 0。如果您确实需要 N x N 的二维数组,请使用 @Martijn 的答案。

这是 @Martijn Pieters 答案的有序并归一化的版本:

from collections import Counter, defaultdict
from itertools import count
import numpy as np


def tmatrix(lst):
    """Return the sorted, row-normalised transition matrix of ``lst``.

    Distinct elements are indexed in sorted order, so row/column ``i``
    always belongs to the ``i``-th smallest element.  Cell ``[i][j]`` is the
    fraction of transitions leaving element ``i`` that go to element ``j``.

    Args:
        lst: Sliceable sequence of hashable, mutually comparable elements,
            e.g. a list of characters.

    Returns:
        A tuple ``(b, indices)`` where ``b`` is a square float
        ``numpy.ndarray`` and ``indices`` is a dict mapping each distinct
        element to its row/column index in ``b``.  Rows of elements that
        have no outgoing transition are left as all zeros.
    """
    # Assign indices up front from the sorted distinct elements; assigning
    # them lazily while walking the pairs does not yield a sorted layout.
    indices = {item: i for i, item in enumerate(sorted(set(lst)))}
    size = len(indices)
    b = np.zeros([size, size])

    # zip together consecutive elements of the list and count each pair
    for (x, y), c in Counter(zip(lst, lst[1:])).items():
        b[indices[x], indices[y]] = float(c)

    # Normalise every row by its total, computed once for the whole matrix.
    row_sums = b.sum(axis=1, keepdims=True)
    # Divide empty rows by 1 instead of 0 so they stay zero rather than NaN.
    row_sums[row_sums == 0] = 1.0
    b /= row_sums

    return b, indices