error when comparing files : IndexError: list index out of range

I have a program that compares files line by line and computes the precision by reading two folders, a "gold" folder and a "prediction" folder.

The extracted (prediction) file looks like this:

T1  Task 5 19   nonlinear wave
T2  Task 5 29   nonlinear wave equations
T3  Task 15 29  wave equations
T4  Task 86 111 general analytical method
T5  Task 94 111 analytical method
T6  Task 199 213    minimum stages
T7  Task 268 287    efficient technique
T8  Task 268 298    efficient technique relatingto

And the gold file:

T1  Process 5 14    oxidation
T2  Material 69 84  Ti-based alloys
T3  Material 186 192    alloys
T4  Task 264 349    understand the role that composition has on the oxidation behavior of Ti-based alloys
T5  Process 312 321 oxidation
T6  Material 334 349    Ti-based alloys
T7  Material 400 415    Ti-based alloys
T8  Material 445 451    alloys
T9  Process 480 489 oxidation

The problem is that this code produces the following error:

Traceback (most recent call last):
  File "C:\Users\chedi\Downloads\Semeval\eval.py", line 214, in <module>
    calculateMeasures(folder_gold, folder_pred, remove_anno)
  File "C:\Users\chedi\Downloads\Semeval\eval.py", line 31, in calculateMeasures
    res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
  File "C:\Users\chedi\Downloads\Semeval\eval.py", line 130, in normaliseAnnotations
    r_g_offs = r_g[1].split(" ")
IndexError: list index out of range

The error points to line 130 and to the format of the extracted file, but both files seem to have the same format: the first and second columns are separated by a tab, and the offsets in the second column by a space.

#!/usr/bin/python
# by Mattew Peters, who spotted that sklearn does macro averaging not micro averaging correctly and changed it

import os
from sklearn.metrics import precision_recall_fscore_support
import sys

def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno = ""):
    '''
    Calculate P, R, F1, Macro F
    :param folder_gold: folder containing gold standard .ann files
    :param folder_pred: folder containing prediction .ann files
    :param remove_anno: if set to "rel", relations will be ignored. Use this setting to only evaluate
    keyphrase boundary recognition and keyphrase classification. If set to "types", only keyphrase boundary recognition is evaluated.
    Note that for the latter, false positive
    :return:
    '''

    flist_gold = os.listdir(folder_gold)
    res_all_gold = []
    res_all_pred = []
    targets = []

    for f in flist_gold:
        # ignoring non-.ann files, should there be any
        if not str(f).endswith(".ann"):
            continue
        f_gold = open(os.path.join(folder_gold, f), "r")
        try:
            f_pred = open(os.path.join(folder_pred, f), "r")
            res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
        except IOError:
            print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
            res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []

        res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)

        spans_all = set(spans_gold + spans_pred)

        for i, r in enumerate(spans_all):
            if r in spans_gold:
                target = res_gold[spans_gold.index(r)].split(" ")[0]
                res_all_gold.append(target)
                if not target in targets:
                    targets.append(target)
            else:
                # those are the false positives, contained in pred but not gold
                res_all_gold.append("NONE")

            if r in spans_pred:
                target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
                res_all_pred.append(target_pred)
            else:
                # those are the false negatives, contained in gold but not pred
                res_all_pred.append("NONE")


    #y_true, y_pred, labels, targets
    prec, recall, f1, support = precision_recall_fscore_support(
        res_all_gold, res_all_pred, labels=targets, average=None)
    # unpack the precision, recall, f1 and support
    metrics = {}
    for k, target in enumerate(targets):
        metrics[target] = {
            'precision': prec[k],
            'recall': recall[k],
            'f1-score': f1[k],
            'support': support[k]
        }

    # now micro-averaged
    if remove_anno != 'types':
        prec, recall, f1, s = precision_recall_fscore_support(
            res_all_gold, res_all_pred, labels=targets, average='micro')
        metrics['overall'] = {
            'precision': prec,
            'recall': recall,
            'f1-score': f1,
            'support': sum(support)
        }
    else:
        # just binary classification, nothing to average
        metrics['overall'] = metrics['KEYPHRASE-NOTYPES']

    print_report(metrics, targets)
    return metrics


def print_report(metrics, targets, digits=2):
    def _get_line(results, target, columns):
        line = [target]
        for column in columns[:-1]:
            line.append("{0:0.{1}f}".format(results[column], digits))
        line.append("%s" % results[columns[-1]])
        return line

    columns = ['precision', 'recall', 'f1-score', 'support']

    fmt = '%11s' + '%9s' * 4 + '\n'
    report = [fmt % tuple([''] + columns)]
    report.append('\n')
    for target in targets:
        results = metrics[target]
        line = _get_line(results, target, columns)
        report.append(fmt % tuple(line))
    report.append('\n')

    # overall
    line = _get_line(metrics['overall'], 'avg / total', columns)
    report.append(fmt % tuple(line))
    report.append('\n')

    print(''.join(report))


def normaliseAnnotations(file_anno, remove_anno):
    '''
    Parse annotations from the annotation files: remove relations (if requested), convert rel IDs to entity spans
    :param file_anno:
    :param remove_anno:
    :return:
    '''
    res_full_anno = []
    res_anno = []
    spans_anno = []
    rels_anno = []

    for l in file_anno:
        r_g = l.strip().split("\t")
        r_g_offs = r_g[1].split(" ")

        # remove relation instances if specified
        if remove_anno != "" and r_g_offs[0].endswith("-of"):
            continue

        res_full_anno.append(l.strip())
        # normalise relation instances by looking up entity spans for relation IDs
        if r_g_offs[0].endswith("-of"):
            arg1 = r_g_offs[1].replace("Arg1:", "")
            arg2 = r_g_offs[2].replace("Arg2:", "")
            for l in res_full_anno:
                r_g_tmp = l.strip().split("\t")
                if r_g_tmp[0] == arg1:
                    ent1 = r_g_tmp[1].replace(" ", "_")
                if r_g_tmp[0] == arg2:
                    ent2 = r_g_tmp[1].replace(" ", "_")

            spans_anno.append(" ".join([ent1, ent2]))
            res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
            rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))

        else:
            spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
            keytype = r_g[1]
            if remove_anno == "types":
                keytype = "KEYPHRASE-NOTYPES"
            res_anno.append(keytype)



    for r in rels_anno:
        r_offs = r.split(" ")
        # reorder hyponyms to start with smallest index
        if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:  # 1, 2
            r = " ".join([r_offs[0], r_offs[2], r_offs[1]])

        # Check, in all other hyponym relations, if the synonymous entity with smallest index is used for them.
        # If not, change it so it is.
        if r_offs[0] == "Synonym-of":
            for r2 in rels_anno:
                r2_offs = r2.split(" ")
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
                    r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
                    r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

    rels_anno = list(set(rels_anno))

    res_full_anno_new = []
    res_anno_new = []
    spans_anno_new = []

    for r in res_full_anno:
        r_g = r.strip().split("\t")
        if r_g[0].startswith("R") or r_g[0] == "*":
            continue
        ind = res_full_anno.index(r)
        res_full_anno_new.append(r)
        res_anno_new.append(res_anno[ind])
        spans_anno_new.append(spans_anno[ind])

    for r in rels_anno:
        res_full_anno_new.append("R\t" + r)
        res_anno_new.append(r)
        spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))

    return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno


if __name__ == '__main__':
    folder_gold = "data/dev/"
    folder_pred = "data_pred/dev/"
    remove_anno = ""  # "", "rel" or "types"
    if len(sys.argv) >= 2:
        folder_gold = sys.argv[1]
    if len(sys.argv) >= 3:
        folder_pred = sys.argv[2]
    if len(sys.argv) == 4:
        remove_anno = sys.argv[3]

    calculateMeasures(folder_gold, folder_pred, remove_anno)

I don't have the files myself, so I tried using the "gold" file you provided, namely:

T1      Process 5 14    oxidation
T2      Material 69 84  Ti-based alloys
T3      Material 186 192    alloys
T4      Task 264 349    understand the role that composition has on the oxidation behavior of Ti-based alloys
T5      Process 312 321 oxidation
T6      Material 334 349    Ti-based alloys
T7      Material 400 415    Ti-based alloys
T8      Material 445 451    alloys
T9      Process 480 489 oxidation

For the program to run correctly without the 'list index out of range' error on the line you mention, it is essential that the first column (the 'T's) and the second column are separated by a tab, and that the other fields are separated by a space. Formatting the files differently (for example, using a space instead of a tab between the first two columns) will produce exactly that error.

In fact, what really happens in the line

r_g = l.strip('\n').split("\t")

is that the newline character is first removed from the end of the line, and the line is then split on tabs. For a well-formed line this produces the tab-separated fields that make up the list r_g, so r_g[1] exists and r_g_offs can be computed correctly; it will contain the annotation type and the two offsets. These values are used later on, for example in

spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))

to name just one place.
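
As a quick illustration, using a hypothetical correctly formatted line (not one of your actual lines):

line = "T1\tProcess 5 14\toxidation\n"
r_g = line.strip('\n').split("\t")
# r_g == ['T1', 'Process 5 14', 'oxidation']
r_g_offs = r_g[1].split()
# r_g_offs == ['Process', '5', '14']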

Let's look at the case that does not work and try to understand why. If a line of the .ann (gold) file is formatted not like this:

T1\tProcess (tab between)

but instead like this:

T1 Process (space)

then the code

r_g = l.strip('\n').split("\t")

will produce a list with only one element instead of two or more, e.g.

r_g = ['T1 Process ...']

In this case r_g contains a single element, r_g[0], so when the script tries to access the non-existent element r_g[1] in

r_g_offs = r_g[1].split()

you get an

IndexError: list index out of range
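
You can reproduce this with a hypothetical mis-formatted line:

line = "T1 Process 5 14    oxidation\n"
r_g = line.strip('\n').split("\t")
# r_g == ['T1 Process 5 14    oxidation']  (a single element)
r_g_offs = r_g[1].split()  # IndexError: list index out of range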


There is another case in which you could get the error above: an empty line at the end of the file. In that case r_g = [''], i.e. r_g is a list with a single element. As in the previous case, when the script executes the line r_g_offs = r_g[1].split() it tries to access r_g[1], which does not exist because the only element of the list is r_g[0], and you get the 'list index out of range' error.
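
Again, a hypothetical example:

line = "\n"  # an empty line at the end of the file
r_g = line.strip('\n').split("\t")
# r_g == ['']  -> only r_g[0] exists, so accessing r_g[1] raises the IndexError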


The code that I could run:

#!/usr/bin/python
# by Mattew Peters, who spotted that sklearn does macro averaging not
# micro averaging correctly and changed it

import os
from sklearn.metrics import precision_recall_fscore_support
import sys

def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno=""):
    '''
    Calculate P, R, F1, Macro F
    :param folder_gold: folder containing gold standard .ann files
    :param folder_pred: folder containing prediction .ann files
    :param remove_anno: if set to "rel", relations will be ignored. Use this setting to only evaluate
    keyphrase boundary recognition and keyphrase classification. If set to "types", only keyphrase boundary recognition is evaluated.
    Note that for the latter, false positive
    :return:
    '''

    flist_gold = os.listdir(folder_gold)
    res_all_gold = []
    res_all_pred = []
    targets = []

    for f in flist_gold:
        # ignoring non-.ann files, should there be any
        if not str(f).endswith(".ann"):
            continue
        f_gold = open(os.path.join(folder_gold, f), "r")
        try:
            f_pred = open(os.path.join(folder_pred, f), "r")
            res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
        except IOError:
            print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
            res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []

        res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)

        spans_all = set(spans_gold + spans_pred)

        for i, r in enumerate(spans_all):
            if r in spans_gold:
                target = res_gold[spans_gold.index(r)].split(" ")[0]
                res_all_gold.append(target)
                if not target in targets:
                    targets.append(target)
            else:
                # those are the false positives, contained in pred but not gold
                res_all_gold.append("NONE")

            if r in spans_pred:
                target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
                res_all_pred.append(target_pred)
            else:
                # those are the false negatives, contained in gold but not pred
                res_all_pred.append("NONE")

    #y_true, y_pred, labels, targets
    prec, recall, f1, support = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average=None)
    metrics = {}
    for k, target in enumerate(targets):
        metrics[target] = {
            'precision': prec[k],
            'recall': recall[k],
            'f1-score': f1[k],
            'support': support[k]
        }

    # now micro-averaged
    if remove_anno != 'types':
        prec, recall, f1, s = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average='micro')
        metrics['overall'] = {
            'precision': prec,
            'recall': recall,
            'f1-score': f1,
            'support': sum(support)
        }
    else:
        # just binary classification, nothing to average
        metrics['overall'] = metrics['KEYPHRASE-NOTYPES']

    print_report(metrics, targets)
    return metrics

def print_report(metrics, targets, digits=2):
    def _get_line(results, target, columns):
        line = [target]
        for column in columns[:-1]:
            line.append("{0:0.{1}f}".format(results[column], digits))
        line.append("%s" % results[columns[-1]])
        return line

    columns = ['precision', 'recall', 'f1-score', 'support']

    fmt = '%11s' + '%9s' * 4 + '\n'
    report = [fmt % tuple([''] + columns)]
    report.append('\n')
    for target in targets:
        results = metrics[target]
        line = _get_line(results, target, columns)
        report.append(fmt % tuple(line))
    report.append('\n')

    # overall
    line = _get_line(metrics['overall'], 'avg / total', columns)
    report.append(fmt % tuple(line))
    report.append('\n')

    print(''.join(report))

def normaliseAnnotations(file_anno, remove_anno):
    '''
    Parse annotations from the annotation files: remove relations (if requested), convert rel IDs to entity spans
    :param file_anno:
    :param remove_anno:
    :return:
    '''
    res_full_anno = []
    res_anno = []
    spans_anno = []
    rels_anno = []

    for l in file_anno:
        print(l)
        print(l.strip('\n'))
        r_g = l.strip('\n').split("\t")
        print(r_g)
        print(len(r_g))
        r_g_offs = r_g[1].split()
        print(r_g_offs)
        if remove_anno != "" and r_g_offs[0].endswith("-of"):
            continue

        res_full_anno.append(l.strip())

        if r_g_offs[0].endswith("-of"):
            arg1 = r_g_offs[1].replace("Arg1:", "")
            arg2 = r_g_offs[2].replace("Arg2:", "")
            for l in res_full_anno:
                r_g_tmp = l.strip().split("\t")
                if r_g_tmp[0] == arg1:
                    ent1 = r_g_tmp[1].replace(" ", "_")
                if r_g_tmp[0] == arg2:
                    ent2 = r_g_tmp[1].replace(" ", "_")

            spans_anno.append(" ".join([ent1, ent2]))
            res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
            rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))

        else:
            spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
            keytype = r_g[1]
            if remove_anno == "types":
                keytype = "KEYPHRASE-NOTYPES"
            res_anno.append(keytype)

    for r in rels_anno:
        r_offs = r.split(" ")
        # reorder hyponyms to start with smallest index
        if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:  # 1, 2
            r = " ".join([r_offs[0], r_offs[2], r_offs[1]])
        if r_offs[0] == "Synonym-of":
            for r2 in rels_anno:
                r2_offs = r2.split(" ")
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
                    r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
                    r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

    rels_anno = list(set(rels_anno))

    res_full_anno_new = []
    res_anno_new = []
    spans_anno_new = []

    for r in res_full_anno:
        r_g = r.strip().split("\t")
        if r_g[0].startswith("R") or r_g[0] == "*":
            continue
        ind = res_full_anno.index(r)
        res_full_anno_new.append(r)
        res_anno_new.append(res_anno[ind])
        spans_anno_new.append(spans_anno[ind])

    for r in rels_anno:
        res_full_anno_new.append("R\t" + r)
        res_anno_new.append(r)
        spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))

    return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno

if __name__ == '__main__':
    folder_gold = "data/dev/"
    folder_pred = "data_pred/dev/"
    remove_anno = ""  # "", "rel" or "types"
    if len(sys.argv) >= 2:
        folder_gold = sys.argv[1]
    if len(sys.argv) >= 3:
        folder_pred = sys.argv[2]
    if len(sys.argv) == 4:
        remove_anno = sys.argv[3]

    calculateMeasures(folder_gold, folder_pred, remove_anno)

From the two cases shown above, we can conclude that the script is very sensitive to how the files are formatted/written (tabs vs. spaces, and no empty line at the end), so care is needed when producing those files and feeding them to the main script.
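
If you cannot fully control how the .ann files are produced, a simple pre-check makes the problem easier to spot. This is not part of the original script, just a minimal sketch (the helper name check_ann_file is hypothetical):

def check_ann_file(path):
    # flag lines that would make normaliseAnnotations fail with an IndexError
    with open(path) as f:
        for i, l in enumerate(f, 1):
            r_g = l.strip('\n').split("\t")
            if len(r_g) < 2:
                print("%s, line %d: empty line or no tab after the ID column" % (path, i))

Running such a check on every .ann file in both folders before calling calculateMeasures points you directly to the offending file and line.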