通过 Gensim 访问 MALLET 的诊断文件

Accessing MALLET's diagnostics file via Gensim

有没有办法在 Python 中访问 MALLET 的 diagnostics file or its content by using the provided API via Gensim

好像没有这个可能。 我通过 Python 的子进程模块在命令行中通过 运行 MALLET 解决了这个问题:

import subprocess
from pathlib import Path

MALLET_PATH = r"C:\mallet"  # set to where your "bin/mallet" path is

seglen = 500
topic_count = 20
start = 0
iterations = 20
num_threads = 10  # determines threads used for parallel training

# remember to change backslashes if needed
wdir = Path("../..")

corpusdir = wdir.joinpath("5_corpus", f"seglen-{seglen}")
corpusdir.mkdir(exist_ok=True, parents=True)
mallet_dir = wdir.joinpath("6_evaluation/models/mallet", f"seglen-{seglen}")

topic_dir = mallet_dir.joinpath(f"topics-{topic_count}")


def create_input_files():
    # create MALLETs input files
    for file in corpusdir.glob("*.txt"):
        output = mallet_dir.joinpath(f"{file.stem}.mallet")
        # doesn't need to happen more than once -- usually.
        if output.is_file(): continue
        print(f"--{file.stem}")
        cmd = f"bin\mallet import-file " \
              f"--input {file.absolute()} " \
              f"--output {output.absolute()} " \
              f"--keep-sequence"
        subprocess.call(cmd, cwd=MALLET_PATH, shell=True)
    print("import finished")


def modeling():
    # start modeling
    for file in mallet_dir.glob("*.mallet"):
        for i in range(start, iterations):
            print("iteration ", str(i))
            print(f"--{file.stem}")
            # output directory
            formatdir = topic_dir.joinpath(f"{file.stem.split('-')[0]}")
            outputdir = formatdir.joinpath(f"iteration-{i}")
            outputdir.mkdir(parents=True, exist_ok=True)
            outputdir = str(outputdir.absolute())
            # output files
            statefile = outputdir + r"\topic-state.gz"
            keysfile = outputdir + r"\keys.txt"
            compfile = outputdir + r"\composition.txt"
            diagnostics_xml = outputdir + r"\diagnostics.xml"
            # building cmd string
            cmd = f"bin\mallet train-topics " \
                  f"--input {file.absolute()} " \
                  f"--num-topics {topic_count} " \
                  f"--output-state {statefile} " \
                  f"--output-topic-keys {keysfile} " \
                  f"--output-doc-topics {compfile} " \
                  f"--diagnostics-file {diagnostics_xml} " \
                  f"--num-threads {num_threads}"
            # call mallet
            subprocess.call(cmd, cwd=MALLET_PATH, shell=True)
    print("models trained")

#create_input_files()
modeling()