NLTK 资源的 Pyodide 文件系统:丢失的文件
Pyodide filesystem for NLTK resources : missing files
我正在尝试借助 Pyodide 在浏览器中使用 NLTK。
Pyodide 启动良好,设法加载 NLTK,打印其版本。
尽管如此,虽然包下载看起来不错,但在调用 nltk.sent_tokenize(str)
时,NLTK 引发了找不到包“punkt”的错误。
我会说下载的资源在某处丢失了,但我不太了解 Pyodide / WebAssembly 是如何管理文件的。有什么见解吗?
简单版:
# Minimal reproduction: download the "punkt" resource, then split a string
# into sentences and print each one.
import nltk

pkg = "punkt"  # the resource NLTK reports as missing when sent_tokenize() runs
nltk.download(pkg)
for sent in nltk.sent_tokenize("Test string"):
    print(sent)
具有更多详细信息的版本,指定下载目录和服务器 url。
# Detailed reproduction: same flow as the simple version, but with an explicit
# package index URL and an explicit download directory.
import nltk

pkg = "punkt"
download_dir = '/nltk_data'
downloader = nltk.downloader.Downloader(
    server_index_url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml"
)
downloader.download(pkg, download_dir=download_dir)
# Query the same directory that was downloaded into (status() otherwise checks
# the downloader's default directory) and print the result so it is visible.
print(downloader.status(pkg, download_dir=download_dir))
for sent in nltk.sent_tokenize("Test string"):
    print(sent)
完整示例代码:
<!DOCTYPE html>
<html>
<body>
<script type="text/javascript" src="https://cdn.jsdelivr.net/pyodide/v0.18.0/full/pyodide.js"></script>
<script type="text/javascript">
// init Pyodide
/**
 * Start Pyodide from the jsDelivr CDN and preload the packages this page uses.
 *
 * @returns {Promise<object>} Resolves to the Pyodide instance after the
 *   "micropip" and "nltk" packages have been loaded.
 */
async function pyodide_loader() {
  const runtime = await loadPyodide({
    indexURL: "https://cdn.jsdelivr.net/pyodide/v0.18.0/full/",
  });
  // Packages are loaded one after another, "micropip" first.
  for (const packageName of ["micropip", "nltk"]) {
    await runtime.loadPackage(packageName);
  }
  return runtime;
}
let pyodideReadyPromise = pyodide_loader();
// run Python code and load NLTK
/**
 * Wait for the Pyodide instance, then run a small Python script that imports
 * NLTK, prints its version, downloads the "punkt" resource and sentence-tokenizes
 * a test string.
 *
 * @returns {Promise<void>} Settles once the Python script has finished running.
 */
async function load_packages() {
  const pyodide = await pyodideReadyPromise;
  // Python is indentation-sensitive: the script is kept flush-left inside the
  // template literal and only the loop body is indented.
  pyodide.runPython(`
print(f"*** import nltk")
import nltk
print(f"*** NLTK version {nltk.__version__=} imported, downloading resources now")
pkg = "punkt"
nltk.download(pkg)
text = "Just for testing"
for sent in nltk.sent_tokenize(text):
    print(sent)
`);
}
load_packages()
</script>
</body>
</html>
简短的回答是:目前在 Pyodide 中无法用 Python 下载文件,因为 http.client、requests 等都需要 POSIX 套接字,而浏览器虚拟机不支持这类套接字。
奇怪的是 nltk.download 并没有报错——按理说它应该报错。
解决方法是手动下载所需的资源,例如像这条评论所演示的那样,使用 JavaScript 的 fetch API:
from js import fetch
response = await fetch("<url>")
js_buffer = await response.arrayBuffer()
py_buffer = js_buffer.to_py() # this is a memoryview
stream = py_buffer.tobytes() # now we have a bytes object
# that we can finally write under the appropriate path
with open("<file_path>", "wb") as fh:
fh.write(stream)
> 我不太了解 Pyodide / WebAssembly 是如何管理文件的。
默认情况下,它是一个虚拟文件系统(MEMFS),每次页面加载时都会被重置。你可以用标准的 Python 工具(open、os 等)访问它。如有需要,也可以挂载一个持久化的文件系统。
下面是一个使用 pyodide v0.18.1 加载 punkt 的可运行示例。我本想把它作为评论发在 @rth 被采纳的答案下面,但内容超出了评论的 240 个字符限制。
from js import fetch
import nltk
from pathlib import Path
import os, sys, io, zipfile
response = await fetch('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip')
js_buffer = await response.arrayBuffer()
py_buffer = js_buffer.to_py() # this is a memoryview
stream = py_buffer.tobytes() # now we have a bytes object
d = Path("/nltk_data/tokenizers")
d.mkdir(parents=True, exist_ok=True)
Path('/nltk_data/tokenizers/punkt.zip').write_bytes(stream)
# extract punkt.zip
zipfile.ZipFile('/nltk_data/tokenizers/punkt.zip').extractall(
path='/nltk_data/tokenizers/'
)
# check file contents in /nltk_data/tokenizers/
# print(os.listdir("/nltk_data/tokenizers/punkt"))
nltk.word_tokenize("some text here")
在解决这个问题的过程中,我得到了 pyodide 维护者以及 https://github.com/pyodide/pyodide/issues/1798 中其他热心人士的大量帮助。谢谢!
我正在尝试借助 Pyodide 在浏览器中使用 NLTK。Pyodide 启动良好,设法加载 NLTK,打印其版本。
尽管如此,虽然包下载看起来不错,但在调用 nltk.sent_tokenize(str)
时,NLTK 引发了找不到包“punkt”的错误。
我会说下载的资源在某处丢失了,但我不太了解 Pyodide / WebAssembly 是如何管理文件的。有什么见解吗?
简单版:
# Minimal reproduction: download the "punkt" resource, then split a string
# into sentences and print each one.
import nltk

pkg = "punkt"  # the resource NLTK reports as missing when sent_tokenize() runs
nltk.download(pkg)
for sent in nltk.sent_tokenize("Test string"):
    print(sent)
具有更多详细信息的版本,指定下载目录和服务器 url。
# Detailed reproduction: same flow as the simple version, but with an explicit
# package index URL and an explicit download directory.
import nltk

pkg = "punkt"
download_dir = '/nltk_data'
downloader = nltk.downloader.Downloader(
    server_index_url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml"
)
downloader.download(pkg, download_dir=download_dir)
# Query the same directory that was downloaded into (status() otherwise checks
# the downloader's default directory) and print the result so it is visible.
print(downloader.status(pkg, download_dir=download_dir))
for sent in nltk.sent_tokenize("Test string"):
    print(sent)
完整示例代码:
<!DOCTYPE html>
<html>
<body>
<script type="text/javascript" src="https://cdn.jsdelivr.net/pyodide/v0.18.0/full/pyodide.js"></script>
<script type="text/javascript">
// init Pyodide
/**
 * Start Pyodide from the jsDelivr CDN and preload the packages this page uses.
 *
 * @returns {Promise<object>} Resolves to the Pyodide instance after the
 *   "micropip" and "nltk" packages have been loaded.
 */
async function pyodide_loader() {
  const runtime = await loadPyodide({
    indexURL: "https://cdn.jsdelivr.net/pyodide/v0.18.0/full/",
  });
  // Packages are loaded one after another, "micropip" first.
  for (const packageName of ["micropip", "nltk"]) {
    await runtime.loadPackage(packageName);
  }
  return runtime;
}
let pyodideReadyPromise = pyodide_loader();
// run Python code and load NLTK
/**
 * Wait for the Pyodide instance, then run a small Python script that imports
 * NLTK, prints its version, downloads the "punkt" resource and sentence-tokenizes
 * a test string.
 *
 * @returns {Promise<void>} Settles once the Python script has finished running.
 */
async function load_packages() {
  const pyodide = await pyodideReadyPromise;
  // Python is indentation-sensitive: the script is kept flush-left inside the
  // template literal and only the loop body is indented.
  pyodide.runPython(`
print(f"*** import nltk")
import nltk
print(f"*** NLTK version {nltk.__version__=} imported, downloading resources now")
pkg = "punkt"
nltk.download(pkg)
text = "Just for testing"
for sent in nltk.sent_tokenize(text):
    print(sent)
`);
}
load_packages()
</script>
</body>
</html>
简短的回答是:目前在 Pyodide 中无法用 Python 下载文件,因为 http.client、requests 等都需要 POSIX 套接字,而浏览器虚拟机不支持这类套接字。
奇怪的是 nltk.download 并没有报错——按理说它应该报错。
解决方法是手动下载所需的资源,例如像这条评论所演示的那样,使用 JavaScript 的 fetch API:
from js import fetch
response = await fetch("<url>")
js_buffer = await response.arrayBuffer()
py_buffer = js_buffer.to_py() # this is a memoryview
stream = py_buffer.tobytes() # now we have a bytes object
# that we can finally write under the appropriate path
with open("<file_path>", "wb") as fh:
fh.write(stream)
> 我不太了解 Pyodide / WebAssembly 是如何管理文件的。
默认情况下,它是一个虚拟文件系统(MEMFS),每次页面加载时都会被重置。你可以用标准的 Python 工具(open、os 等)访问它。如有需要,也可以挂载一个持久化的文件系统。
下面是一个使用 pyodide v0.18.1 加载 punkt 的可运行示例。我本想把它作为评论发在 @rth 被采纳的答案下面,但内容超出了评论的 240 个字符限制。
from js import fetch
import nltk
from pathlib import Path
import os, sys, io, zipfile
response = await fetch('https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip')
js_buffer = await response.arrayBuffer()
py_buffer = js_buffer.to_py() # this is a memoryview
stream = py_buffer.tobytes() # now we have a bytes object
d = Path("/nltk_data/tokenizers")
d.mkdir(parents=True, exist_ok=True)
Path('/nltk_data/tokenizers/punkt.zip').write_bytes(stream)
# extract punkt.zip
zipfile.ZipFile('/nltk_data/tokenizers/punkt.zip').extractall(
path='/nltk_data/tokenizers/'
)
# check file contents in /nltk_data/tokenizers/
# print(os.listdir("/nltk_data/tokenizers/punkt"))
nltk.word_tokenize("some text here")
在解决这个问题的过程中,我得到了 pyodide 维护者以及 https://github.com/pyodide/pyodide/issues/1798 中其他热心人士的大量帮助。谢谢!