当单词不存在时,将 0 分配给某些单词

Assign 0 to certain words when the words are not present

这是我第一次在 Stack Overflow 上发帖,而且我对编码完全陌生。所以,请多多包涵。

我正在做一个有两组数据文档的实验。 Doc1如下:

TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427

TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464

TOPIC:topic_2 ....
.....
.....

TOPIC:topic_3 1066.0
say 0.062
word 0.182

依此类推直到 100 个主题。

在本文档中,有些词要么出现在所有主题中,要么只出现在少数主题中。所以,我想执行一个过程,如果一个词不存在于一个主题中,我希望该词在该主题中的值为 0。也就是说,BBC 这个词出现在主题 2 中,但不存在主题 1,所以我想将我的列表设为:

TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
Mr 0
s 0
president 0
tell 0
BBC 0

TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398
site 0
Internet 0
online 0
web 0
say 0
image 0

我必须将这些值与另一个文档中存在的另一组值相乘。为此,

from collections import defaultdict
from itertools import groupby, imap

# d maps each word -> list of its per-topic values, in the order the
# topics appear in doc1.
d = defaultdict(list)
# BUG FIX: as pasted, the body of the `with` block was not indented,
# which is an IndentationError.
with open("doc1") as f,open("doc2") as f2:
    # materialise as a list so it can be zipped once per word below
    # (a plain map() object would be exhausted after the first word on Py3)
    values = list(map(float, f2.read().split()))
    for line in f:
        # skip blank lines and the "TOPIC:..." headers
        if line.strip() and not line.startswith("TOPIC"):
            name, val = line.split()
            d[name].append(float(val))

# dot-product of each word's per-topic values with the doc2 weights
for k,v in d.items():
     print("Prob for {} is {}".format(k ,sum(i*j for i, j in zip(v,values)) ))

我的 doc2 格式为:

  0.566667 0.0333333 0.133333 0 0 0  2.43333 0 0.13333......... till 100 values. 

上面的代码考虑了单词 "say"。它检查该单词出现在哪 3 个主题中,并将它们的值收集在一个列表中,如 [0.015, 0.045, 0.062]。此列表与 doc2 中的值相乘,因此值 0.015 乘以 doc2 中的第 0 个值、0.045 乘以 doc2 中的第一个值、0.062 乘以 doc2 中的第二个值。但这不是我想要的。我们可以看到 topic_2 里面没有 "say" 这个词。这里的列表必须包含 [0.015, 0.045, 0, 0.062]。因此,当这些值与 doc2 中各自位置的值相乘时,它们将给出

P(SAY) = (0.566667*0.015) + (0.0333333*0.045) + (0.133333 *0) + (0*0.062)

因此,代码非常好,但只需要进行此修改。

问题是您把所有 TOPIC 当作一个整体处理了。如果您希望对每个部分分别使用原始答案中的代码,请先收集一个包含所有单词的集合,然后将该集合与 defaultdict 的键做差集,找出每个部分缺失的键:

# Python 2 script (imap / viewkeys / iteritems).
# For each TOPIC block in doc1: multiply every word's value by the matching
# weight from doc2, add 0 entries for words missing from that block, and
# print the result immediately.
from collections import defaultdict
d = defaultdict(float)
from itertools import groupby, imap

with open("doc1") as f,open("doc2") as f2:
    # lazily parsed doc2 weights, one per TOPIC, in order
    values = imap(float, f2.read().split())
    # find every word in every TOPIC
    all_words = {line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")}
    f.seek(0) # reset file pointer so doc1 can be iterated again
    # lambda x: not(x.strip()) will split into groups on the empty lines
    for ind, (k, v) in enumerate(groupby(f, key=lambda x: not(x.strip()))):
        # k is False for groups of non-empty lines (the TOPIC blocks)
        if not k:
            # first line of the group is the "TOPIC:..." header
            topic = next(v)
            #  get matching float from values
            # NOTE(review): this rebinds the name `f` (the file handle) to a
            # float; groupby already holds its own reference, so it still
            # works, but the shadowing is confusing.
            f = next(values)
            # iterate over the group
            for s in v:
                name, val = s.split()
                d[name] += (float(val) * f)
            # get difference in all_words vs words in current TOPIC
            # giving 0 as default for missing values
            for word in all_words - d.viewkeys():
                d[word] = 0
            for k,v in d.iteritems():
                print("Prob for {} is {}".format(k,v))
            # fresh dict for the next TOPIC block
            d = defaultdict(float)

要存储所有输出,您可以将字典添加到列表中:

# Python 2 script. Same per-TOPIC computation as above, but each finished
# dict is appended to `out` instead of being printed immediately.
from collections import defaultdict
d = defaultdict(float)
from itertools import groupby, imap
with open("doc1") as f,open("doc2") as f2:
    # lazily parsed doc2 weights, one per TOPIC, in order
    values = imap(float, f2.read().split())
    # every word that appears under any TOPIC in doc1
    all_words = {line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # reset file pointer so doc1 can be iterated again
    out = []
    # lambda x: not(x.strip()) will split into groups on the empty lines
    for ind, (k, v) in enumerate(groupby(f, key=lambda x: not(x.strip()))):
        if not k:
            # first line of the group is the "TOPIC:..." header
            topic = next(v)
            #  get matching float from values
            # NOTE(review): rebinds `f` (file handle) to a float — works
            # because groupby keeps its own reference, but it's confusing.
            f = next(values)
            # iterate over the group
            for s in v:
                name, val = s.split()
                d[name] += (float(val) * f)
            # words missing from this TOPIC get a 0 entry
            for word in all_words - d.viewkeys():
                d[word] = 0
            out.append(d)
            # fresh dict for the next TOPIC block
            d = defaultdict(float)

然后遍历列表:

# Print every word/probability pair from each stored per-topic dict.
for topic_dict in out:
    for word, prob in topic_dict.iteritems():
        print("Prob for {} is {}".format(word, prob))

或者不用 defaultdict,改用 dict.fromkeys:

# Python 2 script. Instead of adding missing words afterwards, every dict
# starts with ALL words pre-seeded to zero via dict.fromkeys.
from itertools import groupby, imap

with open("doc1") as f,open("doc2") as f2:
    # lazily parsed doc2 weights, one per TOPIC, in order
    values = imap(float, f2.read().split())
    # every word that appears under any TOPIC (list: duplicates are harmless
    # to fromkeys, which keeps one key per distinct word)
    all_words = [line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")]
    f.seek(0)  # reset file pointer so doc1 can be iterated again
    out, d = [], dict.fromkeys(all_words ,0.0)
    # lambda x: not(x.strip()) will split into groups on the empty lines
    for ind, (k, v) in enumerate(groupby(f, key=lambda x: not(x.strip()))):
        if not k:
            # first line of the group is the "TOPIC:..." header
            topic = next(v)
            #  get matching float from values
            # NOTE(review): rebinds `f` (file handle) to a float.
            f = next(values)
            # iterate over the group
            for s in v:
                name, val = s.split()
                d[name] += (float(val) * f)
            out.append(d)
            # NOTE(review): seeded with int 0 here vs float 0.0 above —
            # harmless for the arithmetic, but inconsistent.
            d = dict.fromkeys(all_words ,0)

如果您总是希望缺失的单词排在最后,请使用 collections.OrderedDict —— 第一种做法是把缺失的值追加到字典末尾:

from collections import OrderedDict

from itertools import groupby, imap

# Python 2 script. One OrderedDict per TOPIC so that the missing words,
# added last, always appear at the end.
# BUG FIX: `d` must exist before the first group is processed, otherwise
# d.setdefault below raises NameError on the very first TOPIC.
d = OrderedDict()
with open("doc1") as f,open("doc2") as f2:
    # lazily parsed doc2 weights, one per TOPIC, in order
    values = imap(float, f2.read().split())
    # every word that appears under any TOPIC in doc1
    all_words = {line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # reset file pointer so doc1 can be iterated again
    out = []
    # lambda x: not(x.strip()) will split into groups on the empty lines
    for  (k, v) in groupby(f, key=lambda x: not(x.strip())):
        if not k:
            # first line of the group is the "TOPIC:..." header
            topic = next(v)
            #  get matching float from values (renamed from `f` to avoid
            #  shadowing the file handle)
            flt = next(values)
            # iterate over the group; words are unique within a TOPIC, so
            # setdefault simply inserts each product once, in file order
            for s in v:
                name, val = s.split()
                d.setdefault(name, (float(val) * flt))
            # words absent from this TOPIC get 0, appended after the real ones
            for word in all_words.difference(d):
                d[word] = 0
            out.append(d)
            # fresh OrderedDict for the next TOPIC block
            d = OrderedDict()

for top in out:
    for k,v in top.iteritems():
         print("Prob for {} is {}".format(k,v))

最后按主题按顺序存储:

from collections import OrderedDict

from itertools import groupby, imap

# Python 2 script. Results keyed by topic header, insertion-ordered:
# out = {"TOPIC:..." : OrderedDict(word -> weighted value)}.
with open("doc1") as f,open("doc2") as f2:
    # lazily parsed doc2 weights, one per TOPIC, in order
    values = imap(float, f2.read().split())
    # every word that appears under any TOPIC in doc1
    all_words = {line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # reset file pointer so doc1 can be iterated again
    out = OrderedDict()
    # lambda x: not(x.strip()) will split into groups on the empty lines
    for (k, v) in groupby(f, key=lambda x: not(x.strip())):
        if not k:
            topic = next(v).rstrip()
            # create OrderedDict for each topic
            out[topic] = OrderedDict()
            #  get matching float from values (renamed from `f` to avoid
            #  shadowing the file handle)
            flt = next(values)
            # iterate over the group
            for s in v:
                name, val = s.split()
                out[topic].setdefault(name, (float(val) * flt))
            # find words missing from TOPIC and  set to 0
            for word in  all_words.difference(out[topic]):
                out[topic][word] = 0

for k,v in out.items():
    print(k) # each TOPIC
    for k,v in v.iteritems():
        print("Prob for {} is {}".format(k,v)) # the OrderedDict items
    # BUG FIX: this line was indented 3 spaces (IndentationError); it
    # belongs inside the outer loop to print a separator after each TOPIC.
    print("\n")

文档 1:

TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427

TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398

doc2:

0.345 0.566667

输出:

TOPIC:topic_0 5892.0
Prob for site is 0.0128233197556
Prob for Internet is 0.00901731160895
Prob for online is 0.00790478615073
Prob for web is 0.00755346232181
Prob for say is 0.00550407331974
Prob for image is 0.00521130346231
Prob for BBC is 0
Prob for Mr is 0
Prob for s is 0
Prob for president is 0
Prob for tell is 0


TOPIC:topic_1 12366.0
Prob for Mr is 0.085187930859
Prob for s is 0.0293277438137
Prob for say is 0.0255701266375
Prob for president is 0.00870667394471
Prob for tell is 0.0076985327511
Prob for BBC is 0.0076985327511
Prob for web is 0
Prob for image is 0
Prob for online is 0
Prob for site is 0
Prob for Internet is 0

您可以使用常规的 for 循环应用完全相同的逻辑,groupby 只是为您完成所有分组工作。

如果你真的只是想写入一个文件,那么代码就更简单了:

# Python 2 script. Streams the weighted results straight into prob.txt
# instead of building dicts: real words first, then the missing words as 0.
from itertools import groupby, imap
with open("doc1") as f,open("doc2") as f2,open("prob.txt","w") as f3:
    # lazily parsed doc2 weights, one per TOPIC, in order
    values = imap(float, f2.read().split())
    # every word that appears under any TOPIC in doc1
    all_words = {line.split()[0] for line in f if line.strip() and not line.startswith("TOPIC")}
    f.seek(0)  # reset file pointer so doc1 can be iterated again
    # groups doc1 on empty lines; k is False for the TOPIC blocks
    for (k, v) in groupby(f, key=lambda x: not(x.strip())):
        if not k:
            # header line plus a list of the words seen in this block
            topic, words  = next(v), []
            # the doc2 weight matching this topic
            flt = next(values)
            f3.write(topic)    
            for s in v:
                name, val = s.split()
                words.append(name)
                f3.write("{} {}\n".format(name, (float(val) * flt)))
            # words absent from this TOPIC are written last with value 0
            for word in all_words.difference(words):
                  f3.write("{} {}\n".format(word, 0))
            # blank separator line between TOPIC blocks
            f3.write("\n")

prob.txt:

TOPIC:topic_0 5892.0
site 0.0128233197556
Internet 0.00901731160895
online 0.00790478615073
web 0.00755346232181
say 0.00550407331974
image 0.00521130346231
BBC 0
Mr 0
s 0
president 0
tell 0

TOPIC:topic_1 12366.0
Mr 0.085187930859
s 0.0293277438137
say 0.0255701266375
president 0.00870667394471
tell 0.0076985327511
BBC 0.0076985327511
web 0
image 0
online 0
site 0
Internet 0

作为重写各个块的另一种简洁方法,您可以先把所有单词名称存入一个集合,再把每个块的行收集到一个 OrderedDict 中,然后对每个块自己的单词集合使用 set.difference 求出缺失的单词,并把它们写在该块的末尾:

from itertools import tee
from collections import OrderedDict

# Maps each "TOPIC..." header line -> list of the value lines under it,
# preserving file order.
d=OrderedDict()
with open('input.txt') as f,open('new','w') as new:
    # three independent iterators over the same file:
    # f collects all words, f2 walks headers, f3 runs one line ahead of f2
    f2,f3,f=tee(f,3)
    next(f3)
    # every word that appears in any block
    words={line.split()[0] for line in f if not line.startswith('TOPIC') and line.strip()}

    for line in f2:
        if line.startswith('TOPIC'):
           key=line
           next_line=next(f3)
           try:
               # collect this block's lines until the next TOPIC header
               while not next_line.startswith('TOPIC'):
                  # BUG FIX: the file's last line may lack a trailing
                  # newline, which fused the appended zero-lines onto it
                  # (e.g. "BBC 0.0135856380398web 0"); normalise it here.
                  if not next_line.endswith('\n'):
                      next_line += '\n'
                  d.setdefault(key,[]).append(next_line)
                  next_line=next(f3)
           # BUG FIX: was a bare `except:` — only StopIteration (f3
           # exhausted at end of file) is expected here; a bare except
           # would also swallow real errors.
           except StopIteration:
                pass

    for k,v in d.items():
        # words actually present in this block
        block_words={line.split()[0] for line in v if line.strip()}
        # words missing from this block, written at the end with value 0
        insec=words.difference(block_words)
        new.writelines([k]+v+['{} {}\n'.format(i,0) for i in insec])

结果:

TOPIC:topic_0 5892.0
site 0.0371690427699
Internet 0.0261371350984
online 0.0229124236253
web 0.0218940936864
say 0.0159538357094
image 0.015105227427
president 0
s 0
BBC 0
tell 0
Mr 0
TOPIC:topic_1 12366.0
Mr 0.150331554262
s 0.0517548115801
say 0.0451237263464
president 0.0153647096879
tell 0.0135856380398
BBC 0.0135856380398web 0
image 0
online 0
site 0
Internet 0

我首先将 file1 读取为映射列表 {word, value},每个主题构建列表的一个元素。

# Read Doc1 into `maps`: one dict per TOPIC block, mapping word -> value.
# NOTE: values are kept as strings here; they are converted with float()
# only when the final `d` is built later.
with open('Doc1') as f:
    maps = []
    for line in f:
        line = line.strip()
        if line.startswith('TOPIC'):
            # a header starts a new per-topic mapping
            mapping = {}
            maps.append(mapping)
        elif len(line) == 0:
            # blank separator line between blocks
            pass
        else:
            # a "word value" data line belongs to the current mapping
            k, v = line.split()
            mapping[k] = v

然后我将通过从所有映射中获取键的并集来构建一组所有单词

# Collect the union of the key sets of all per-topic mappings.
words = set()
for topic_map in maps:
    words |= set(topic_map.keys())

然后我将迭代每个映射并为字典中尚未存在的单词集中的所有键添加一个 0 值。

# Give every per-topic mapping a 0 entry for each word it lacks, so all
# mappings end up with identical key sets.
for topic_map in maps:
    absent = words.difference(topic_map.keys())
    for word in absent:
        topic_map[word] = 0

那样的话,所有的词都存在于所有的映射中,构建一个漂亮的 d 字典就很简单了:

# d: word -> list of per-topic float values (0 where the word was absent),
# one entry per topic, in the order the topics appear in `maps`.
d = {k : list() for k in words }
for mapping in maps:
    # BUG FIX: the loop iterated the undefined name `mappings`;
    # it must iterate the keys of the current `mapping`.
    for k in mapping:
        d[k].append(float(mapping[k]))

出现在至少一个主题中的每个单词都有一个包含 100 个值的列表,每个主题一个值,存在时为真值,不存在时为 0:zip 现在可以正常工作。