简单的字数统计 Rust 程序输出有效的标准输出,但是当通过管道传输到具有特定内容的 head 程序时会出现恐慌

Simple word count Rust program outputs valid stdout but panics when piped to head program with specific content

我得到了下面这段 Rust 的 panic 回溯(backtrace):


thread 'main' panicked at 'failed printing to stdout: Broken pipe (os error 32)', library/std/src/io/stdio.rs:993:9
stack backtrace:
   0:     0x559ffa959dc0 - std::backtrace_rs::backtrace::libunwind::trace::h72c2fb8038f1bbee
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/../../backtrace/src/backtrace/libunwind.rs:96
   1:     0x559ffa959dc0 - std::backtrace_rs::backtrace::trace_unsynchronized::h1e3b084883f1e78c
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/../../backtrace/src/backtrace/mod.rs:66
   2:     0x559ffa959dc0 - std::sys_common::backtrace::_print_fmt::h3bf6a7ebf7f0394a
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/sys_common/backtrace.rs:79
   3:     0x559ffa959dc0 - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::fmt::h2e8cb764b7fe02e7
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/sys_common/backtrace.rs:58
   4:     0x559ffa972f6c - core::fmt::write::h7a1184eaee6a8644
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/core/src/fmt/mod.rs:1080
   5:     0x559ffa957b12 - std::io::Write::write_fmt::haeeb374d93a67eac
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/io/mod.rs:1516
   6:     0x559ffa95beed - std::sys_common::backtrace::_print::h1d14a7f6ad632dc8
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/sys_common/backtrace.rs:61
   7:     0x559ffa95beed - std::sys_common::backtrace::print::h301abac8bb2e3e81
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/sys_common/backtrace.rs:48
   8:     0x559ffa95beed - std::panicking::default_hook::{{closure}}::hde0cb80358a6920a
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:208
   9:     0x559ffa95bb98 - std::panicking::default_hook::h9b1a691049a0ec8f
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:227
  10:     0x559ffa95c5d1 - std::panicking::rust_panic_with_hook::h2bdec87b60580584
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:577
  11:     0x559ffa95c179 - std::panicking::begin_panic_handler::{{closure}}::h101ca09d9df5db47
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:484
  12:     0x559ffa95a22c - std::sys_common::backtrace::__rust_end_short_backtrace::h3bb85654c20113ca
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/sys_common/backtrace.rs:153
  13:     0x559ffa95c139 - rust_begin_unwind
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:483
  14:     0x559ffa95c0eb - std::panicking::begin_panic_fmt::hf0503558fbe5b251
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:437
  15:     0x559ffa957022 - std::io::stdio::print_to::h9435376f36962f3f
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/io/stdio.rs:993
  16:     0x559ffa957022 - std::io::stdio::_print::h0d31d4b9faa6e1ec
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/io/stdio.rs:1005
  17:     0x559ffa944807 - wordstats::main::h1c2ea6400047a5eb
  18:     0x559ffa942e73 - std::sys_common::backtrace::__rust_begin_short_backtrace::h9e31cf87ddc88116
  19:     0x559ffa942e49 - std::rt::lang_start::{{closure}}::h6c6491f05894818f
  20:     0x559ffa95c9f7 - core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once::he179d32a5d10d957
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/core/src/ops/function.rs:259
  21:     0x559ffa95c9f7 - std::panicking::try::do_call::hcb3d5e7be089b2b4
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:381
  22:     0x559ffa95c9f7 - std::panicking::try::h7ac93b0cd56fb701
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panicking.rs:345
  23:     0x559ffa95c9f7 - std::panic::catch_unwind::h7b40e396c93a4fcd
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/panic.rs:382
  24:     0x559ffa95c9f7 - std::rt::lang_start_internal::h142b9cc66267fea1
                               at /rustc/7eac88abb2e57e752f3302f02be5f3ce3d7adfb4/library/std/src/rt.rs:51
  25:     0x559ffa944ae2 - main
  26:     0x7f6223a380b3 - __libc_start_main
  27:     0x559ffa94209e - _start
  28:                0x0 - <unknown>

当我编译这个程序时

use diacritics;
use std::collections::HashMap;
use std::io;
use std::io::prelude::*;

/// One word together with its occurrence count.
///
/// `output` copies each `(word, count)` pair of the counting map into an
/// `Entry` so the pairs can be sorted by count before printing.
#[derive(Debug)]
struct Entry {
    /// The word, copied from the map key.
    word: String,
    /// How many times the word was counted.
    count: u32,
}

/// Characters that terminate a word when a line is scanned by `line_processor`.
///
/// The `'static` lifetime is implied for references stored in a `static`, so it
/// is not spelled out.
static SEPARATORS: &[char] = &[
    ' ', ',', '.', '!', '?', '\'', '"', '\n', '(', ')', '#', '{', '}', '[', ']', '-', ';', ':',
];

/// Reads stdin line by line, counts the words of every line, then prints the
/// counts.
///
/// # Panics
///
/// Panics if a line cannot be read from stdin (the read result is unwrapped).
fn main() {
    let mut counts = HashMap::<String, u32>::new();

    let input = io::stdin();
    input
        .lock()
        .lines()
        .map(|line| line.unwrap())
        .for_each(|line| line_processor(line, &mut counts));

    output(&mut counts);
}

/// Splits one input line into words and counts each of them in `words`.
///
/// The line is first stripped of diacritics and lowercased, then cut at every
/// character listed in `SEPARATORS`. Empty fragments (produced by consecutive
/// separators) are skipped.
///
/// Splitting the whole string also counts the word that ends the line. A
/// character loop that only emits a word when it meets a separator loses that
/// word, because lines read with `BufRead::lines` carry no trailing newline.
fn line_processor(line: String, words: &mut HashMap<String, u32>) {
    let formatted_line = diacritics::remove_diacritics(&line).to_lowercase();

    for word in formatted_line.split(SEPARATORS).filter(|w| !w.is_empty()) {
        add_word(word.to_string(), words);
    }
}

/// Increments the count of `word` in `words`, inserting it with a count of 1
/// the first time it is seen. Empty words are ignored.
fn add_word(word: String, words: &mut HashMap<String, u32>) {
    if word.is_empty() {
        return;
    }
    // The entry API does a single lookup and moves `word` into the map
    // instead of cloning it.
    *words.entry(word).or_insert(0) += 1;
}

/// Prints every counted word to stdout as `count<TAB>word`, highest count
/// first. Words with equal counts keep the map's iteration order.
///
/// # Panics
///
/// `println!` panics when the write to stdout fails, for example with
/// "Broken pipe" once the reading end of a pipe has been closed.
fn output(words: &mut HashMap<String, u32>) {
    let mut entries: Vec<Entry> = words
        .iter()
        .map(|(word, count)| Entry {
            word: word.to_string(),
            count: *count,
        })
        .collect();

    // Stable sort, largest count first.
    entries.sort_by(|lhs, rhs| rhs.count.cmp(&lhs.count));

    for entry in entries {
        println!("{}\t{}", entry.count, entry.word);
    }
}

这样:

cargo build --release

我运行程序是这样的:

cat src/sample.txt | ./target/release/wordstats  | head -n 50

这个程序应该只显示类似这样的内容(出现次数最多的单词),而不应该有任何 panic 回溯:

15  the
14  in
11  are
10  and
10  of
9   species
9   bats
8   horseshoe
8   is
6   or
6   as
5   which
5   their

对于某些 echo 输出的内容或其他一些文件(例如 cat src/main.rs | ...),程序的表现确实如此;但对于这个文件的内容则不然,它是某个随机维基百科页面的一部分。

我的程序是一个愚蠢的字数统计程序,它只打印表格排序的键值列表。

当我将结果通过管道传输到 head -n 50 程序时会出现问题,但当我打印完整输出时不会出现问题

知道为什么我会得到这样的 panic 回溯吗?是我在程序中以错误的方式处理了某些事情,还是与其他因素有关(Rust 库 / Unix 的错误行为)?

我的 rustc 版本是:rustc 1.48.0 (7eac88abb 2020-11-16)

编辑:

补充之前缺失的 Cargo.toml:

[package]
name = "wordstats"
version = "0.1.0"
authors = ["Eric Régnier <utopman@gmail.com>"]
edition = "2018"

[dependencies]
diacritics = "0.1.1"

首先,您没有提供足够的信息来重现您的问题。您提供了使用第三方依赖项的源代码,但忽略了提供 Cargo.toml。在您的情况下,在不影响手头问题的情况下删除依赖项的使用非常容易,所以这就是我所做的。

其次,强烈不建议在非玩具的命令行程序中使用 println!,原因正在于此。具体来说,有两个问题结合在一起导致了这种不良行为:

  1. println! 如果在写入 stdout 时发生任何错误,将会出现 panic。
  2. Rust 的运行时所做的少数事情之一是忽略 SIGPIPE,这意味着您的应用程序不会收到 PIPE 信号,而是对已关闭的文件描述符的相应写入会返回一个错误。(在那个链接中,您可以看到我公开主张改变这种行为。)

在典型的 C 程序中,SIGPIPE 不会 被忽略。它通常也没有明确处理。当进程收到它不处理的信号时,进程终止。在这种情况下,这正是您想要的。一旦 head 停止读取它的标准输入(你的标准输出),你 想要 你的程序停止,但你也希望它优雅地停止而不是恐慌或打印错误。因为那是 Unix CLI 实用程序所做的。

你有两种方法可以解决这个问题。一种方法是更改代码以显式处理 BrokenPipe 错误。您的代码在编写时假设不会发生错误,因为您对读取标准输入的结果直接调用了 unwrap。所以您的程序不够符合惯用法,也没有为处理错误做好准备。因此,为了正确处理 BrokenPipe,我不得不做一些小改动,使错误能够正确地向上传播:

use std::collections::HashMap;
use std::io;
use std::io::prelude::*;

/// One word together with its occurrence count.
///
/// `output` copies each `(word, count)` pair of the counting map into an
/// `Entry` so the pairs can be sorted by count before being written.
#[derive(Debug)]
struct Entry {
    /// The word, copied from the map key.
    word: String,
    /// How many times the word was counted.
    count: u32,
}

/// Characters that terminate a word when a line is scanned by `line_processor`.
///
/// The `'static` lifetime is implied for references stored in a `static`, so it
/// is not spelled out.
static SEPARATORS: &[char] = &[
    ' ', ',', '.', '!', '?', '\'', '"', '\n', '(', ')', '#', '{', '}', '[', ']', '-', ';', ':',
];

/// Runs `try_main` and turns its result into the process outcome.
///
/// A `BrokenPipe` error means the reader of our stdout (e.g. `head`) has
/// stopped reading; that is a normal way for a pipeline to end, so the program
/// exits quietly. Any other error is reported on stderr and the process exits
/// with status 1 so callers can detect the failure.
fn main() {
    if let Err(err) = try_main() {
        if err.kind() == std::io::ErrorKind::BrokenPipe {
            return;
        }
        // Ignore any error that may occur while writing to stderr.
        let _ = writeln!(std::io::stderr(), "{}", err);
        // Without an explicit non-zero status a failed run would look successful.
        std::process::exit(1);
    }
}

/// Counts the words read from stdin and writes the sorted counts to stdout.
///
/// # Errors
///
/// Returns the first I/O error hit while reading a line from stdin or while
/// `output` writes to stdout.
fn try_main() -> Result<(), std::io::Error> {
    let mut counts = HashMap::<String, u32>::new();

    let input = io::stdin();
    for line in input.lock().lines() {
        line_processor(line?, &mut counts);
    }

    output(&mut counts)
}

/// Splits one input line into words and counts each of them in `words`.
///
/// The line is cut at every character listed in `SEPARATORS`; empty fragments
/// (produced by consecutive separators) are skipped.
///
/// Splitting the whole string also counts the word that ends the line. A
/// character loop that only emits a word when it meets a separator loses that
/// word, because lines read with `BufRead::lines` carry no trailing newline.
fn line_processor(line: String, words: &mut HashMap<String, u32>) {
    for word in line.split(SEPARATORS).filter(|w| !w.is_empty()) {
        add_word(word.to_string(), words);
    }
}

/// Increments the count of `word` in `words`, inserting it with a count of 1
/// the first time it is seen. Empty words are ignored.
fn add_word(word: String, words: &mut HashMap<String, u32>) {
    if word.is_empty() {
        return;
    }
    // The entry API does a single lookup and moves `word` into the map
    // instead of cloning it.
    *words.entry(word).or_insert(0) += 1;
}

/// Writes every counted word to stdout as `count<TAB>word`, highest count
/// first.
///
/// Words with the same count are ordered alphabetically. `HashMap` iteration
/// order is unspecified, so without this tie-break the order of equal counts
/// (and therefore what `head -n N` shows) could change between runs.
///
/// # Errors
///
/// Returns the first error reported by a write to stdout, e.g. `BrokenPipe`
/// when the reading end of the pipe has been closed.
fn output(words: &mut HashMap<String, u32>) -> Result<(), std::io::Error> {
    let mut entries: Vec<Entry> = words
        .iter()
        .map(|(word, count)| Entry {
            word: word.clone(),
            count: *count,
        })
        .collect();

    // Largest count first; ties broken by word so the output is deterministic.
    entries.sort_by(|a, b| b.count.cmp(&a.count).then_with(|| a.word.cmp(&b.word)));

    // Lock stdout once instead of once per line.
    let stdout = io::stdout();
    let mut stdout = stdout.lock();
    for entry in &entries {
        writeln!(stdout, "{}\t{}", entry.count, entry.word)?;
    }
    Ok(())
}

第二种处理方法是恢复 SIGPIPE 的默认行为。这将使您的 Rust 应用程序表现得像 C 应用程序。这可以通过定义一个函数,将 SIGPIPE 的信号处理程序重置为 SIG_DFL 来实现(需要在 Cargo.toml 中添加 libc 依赖):

/// Sets the disposition of `SIGPIPE` to the default action (`SIG_DFL`).
///
/// Compiled only for Unix targets. Uses the `libc` crate.
#[cfg(unix)]
fn reset_sigpipe() {
    // SAFETY: `signal` is called with the constant `SIGPIPE` signal number and
    // the predefined `SIG_DFL` disposition; no Rust callback is installed.
    // NOTE(review): presumably meant to run before any other thread exists — confirm at the call site.
    unsafe {
        libc::signal(libc::SIGPIPE, libc::SIG_DFL);
    }
}

/// Counterpart for non-Unix targets: does nothing, so callers can invoke
/// `reset_sigpipe` unconditionally.
#[cfg(not(unix))]
fn reset_sigpipe() {
    // no-op
}

然后在main中将其作为第一件事调用。然后您可以删除对 BrokenPipe 错误的任何特定处理,因为它不会发生。相反,您的进程将收到一个 PIPE 信号,然后它将终止。这是完整的代码:

use std::collections::HashMap;
use std::io;
use std::io::prelude::*;

/// One word together with its occurrence count.
///
/// `output` copies each `(word, count)` pair of the counting map into an
/// `Entry` so the pairs can be sorted by count before being written.
#[derive(Debug)]
struct Entry {
    /// The word, copied from the map key.
    word: String,
    /// How many times the word was counted.
    count: u32,
}

/// Characters that terminate a word when a line is scanned by `line_processor`.
///
/// The `'static` lifetime is implied for references stored in a `static`, so it
/// is not spelled out.
static SEPARATORS: &[char] = &[
    ' ', ',', '.', '!', '?', '\'', '"', '\n', '(', ')', '#', '{', '}', '[', ']', '-', ';', ':',
];

/// Runs `try_main`; on failure reports the error on stderr and exits with
/// status 1 so callers can detect that the run did not succeed.
fn main() {
    if let Err(err) = try_main() {
        // Ignore any error that may occur while writing to stderr.
        let _ = writeln!(std::io::stderr(), "{}", err);
        // Without an explicit non-zero status a failed run would look successful.
        std::process::exit(1);
    }
}

/// Resets the `SIGPIPE` disposition, counts the words read from stdin and
/// writes the sorted counts to stdout.
///
/// # Errors
///
/// Returns the first I/O error hit while reading a line from stdin or while
/// `output` writes to stdout.
fn try_main() -> Result<(), std::io::Error> {
    // Must happen before anything is written to stdout.
    reset_sigpipe();

    let mut counts = HashMap::<String, u32>::new();

    let input = io::stdin();
    for line in input.lock().lines() {
        line_processor(line?, &mut counts);
    }

    output(&mut counts)
}

/// Splits one input line into words and counts each of them in `words`.
///
/// The line is cut at every character listed in `SEPARATORS`; empty fragments
/// (produced by consecutive separators) are skipped.
///
/// Splitting the whole string also counts the word that ends the line. A
/// character loop that only emits a word when it meets a separator loses that
/// word, because lines read with `BufRead::lines` carry no trailing newline.
fn line_processor(line: String, words: &mut HashMap<String, u32>) {
    for word in line.split(SEPARATORS).filter(|w| !w.is_empty()) {
        add_word(word.to_string(), words);
    }
}

/// Increments the count of `word` in `words`, inserting it with a count of 1
/// the first time it is seen. Empty words are ignored.
fn add_word(word: String, words: &mut HashMap<String, u32>) {
    if word.is_empty() {
        return;
    }
    // The entry API does a single lookup and moves `word` into the map
    // instead of cloning it.
    *words.entry(word).or_insert(0) += 1;
}

/// Writes every counted word to stdout as `count<TAB>word`, highest count
/// first.
///
/// Words with the same count are ordered alphabetically. `HashMap` iteration
/// order is unspecified, so without this tie-break the order of equal counts
/// (and therefore what `head -n N` shows) could change between runs.
///
/// # Errors
///
/// Returns the first error reported by a write to stdout.
fn output(words: &mut HashMap<String, u32>) -> Result<(), std::io::Error> {
    let mut entries: Vec<Entry> = words
        .iter()
        .map(|(word, count)| Entry {
            word: word.clone(),
            count: *count,
        })
        .collect();

    // Largest count first; ties broken by word so the output is deterministic.
    entries.sort_by(|a, b| b.count.cmp(&a.count).then_with(|| a.word.cmp(&b.word)));

    // Lock stdout once instead of once per line.
    let stdout = io::stdout();
    let mut stdout = stdout.lock();
    for entry in &entries {
        writeln!(stdout, "{}\t{}", entry.count, entry.word)?;
    }
    Ok(())
}


/// Sets the disposition of `SIGPIPE` to the default action (`SIG_DFL`).
///
/// Compiled only for Unix targets. Uses the `libc` crate.
#[cfg(unix)]
fn reset_sigpipe() {
    // SAFETY: `signal` is called with the constant `SIGPIPE` signal number and
    // the predefined `SIG_DFL` disposition; no Rust callback is installed.
    // NOTE(review): presumably meant to run before any other thread exists — confirm at the call site.
    unsafe {
        libc::signal(libc::SIGPIPE, libc::SIG_DFL);
    }
}

/// Counterpart for non-Unix targets: does nothing, so callers can invoke
/// `reset_sigpipe` unconditionally.
#[cfg(not(unix))]
fn reset_sigpipe() {
    // no-op
}