使用 boost 文件系统在 C++ 中搜索部分文件名

Search partial filenames in C++ using boost filesystem

问题很简单：我想在一个目录中查找某个文件的路径，但我只知道文件名的一部分。下面是我为此编写的函数：

// Recursively walks `root` and appends to `ret` the path of every regular
// file whose extension compares equal to `ext` (e.g. ".txt").
// Does nothing when `root` does not exist or is not a directory.
void getfiles(const fs::path& root, const string& ext, vector<fs::path>& ret)
{
    // Only an existing directory can be scanned.
    const bool scannable = fs::exists(root) && fs::is_directory(root);
    if (!scannable) return;

    const fs::recursive_directory_iterator last;
    for (fs::recursive_directory_iterator entry(root); entry != last; ++entry)
    {
        if (!fs::is_regular_file(*entry)) continue;
        if (entry->path().extension() == ext) ret.push_back(entry->path());
    }
}

// Searches `dir_path` recursively for the first entry whose file name
// contains `file_name` (case-insensitively).
// On success stores the entry's path in `path_found` and returns true;
// otherwise returns false and leaves `path_found` untouched.
// The match result (0 or 1) of every visited entry is written to cerr.
bool find_file(const filesystem::path& dir_path, const filesystem::path file_name, filesystem::path& path_found) {
    const fs::recursive_directory_iterator last;

    // Predicate applied to each visited entry; logs its own result.
    auto name_matches = [file_name](fs::path candidate) {
        const bool hit = boost::algorithm::icontains(candidate.filename().native(), file_name.native());
        cerr << hit << endl;
        return hit;
    };

    const auto match = find_if(fs::recursive_directory_iterator(dir_path), last, name_matches);
    if (match != last) {
        path_found = match->path();
        return true;
    }
    return false;
}


int main (int argc, char* argv[]) 
{
    vector<fs::path> inputClass ;
    fs::path textFiles,datasetPath,imgpath;
    textFiles=argv[1];
    datasetPath=argv[2];

    getfiles(textFiles,".txt",inputClass);
    
    for (int i=0;i<inputClass.size();i++)
        
    {
        ifstream lblFile(inputClass[i].string().c_str());
        string line;
        fs::path classname=inputClass[i].parent_path()/inputClass[i].stem().string();
        cerr<<classname.stem()<<endl;
        while (getline(lblFile,line))
        {
            
            bool find=find_file(datasetPath,line,imgpath);
            if (find)
            {
                
                while(!fs::exists(classname))
                    fs::create_directories (classname);
                fs::copy(imgpath,classname/imgpath.filename());
                cerr<<"Found\n";
            }
            else
                cerr<<"Not Found \n";
            
            
        }
        lblFile.close();
    }
    
    
}

控制台输出:

"490"
vfv343434.jpeg||E9408000EC0
0
fsdfdsfdfsf.jpeg||E9408000EC0
0
1200E9408000EC0.jpeg||E9408000EC0
0
Not Found 

但是当我手动设置搜索字符串时，它工作正常！我还尝试了其他搜索字符串的方法（例如 std::find），但都找不到子字符串。看起来是输入字符串（line）有问题，可是我把所有字符都打印了出来，并没有发现特殊字符之类的东西。如果我手动设置搜索字符串，它会按预期工作：

// Debugging variant: the needle is a hard-coded string instead of the value
// read from the input file. Presumably placed inside the find_file predicate,
// where `e` is the candidate path - confirm against the lambda above.
string search="E9408000EC0";
        cerr<<e.filename().native()<<"||"<<search<<endl;
        cerr<<boost::algorithm::icontains(e.filename().native() ,search)<<endl;

 

做了上述修改后，输出如下：

"490"
vfv343434.jpeg||E9408000EC0
0
fsdfdsfdfsf.jpeg||E9408000EC0
0
1200E9408000EC0.jpeg||E9408000EC0
1
Found

我无法复现这个问题。

我唯一的猜测是：在您的平台上，string() 访问器返回的也许不是纯字符串，而是（例如）带引号的路径，这会破坏搜索。请考虑改用 native() 访问器。

（事实上，由于 file_name 不是路径，而是一个字符串模式，建议改为以 std::string_view 或类似的类型来传递该参数。）

Live On Coliru

#include <boost/filesystem.hpp>
#include <boost/algorithm/string.hpp>
#include <iostream>
namespace fs = boost::filesystem;

// Walks `dir_path` recursively and writes to `out` every directory entry
// whose file name contains `file_name`, compared case-insensitively.
// `out` is an output iterator (the demo passes a back_inserter).
template <typename Out>
void find_file(const fs::path& dir_path, const fs::path file_name, Out out) {
    for (fs::recursive_directory_iterator cursor(dir_path), last; cursor != last; ++cursor) {
        // Convert the entry to a path so its file name can be inspected.
        const fs::path candidate = *cursor;
        const bool hit = boost::algorithm::icontains(candidate.filename().native(),
                                                     file_name.native());
        if (hit) {
            *out = *cursor;
            ++out;
        }
    }
}

int main() {
    fs::path d = "a/b/c/e";
    fs::create_directories(d);
    {
        std::ofstream ofs(d / "1200E9408000EC0.jpeg");
    }

    std::cout << fs::path("000EC0").native() << "\n";

    std::vector<fs::path> found;
    find_file(".", "000EC0", back_inserter(found));

    for (auto &f : found)
    {
        std::cout << "Found: " << f << "\n";
    }
}

输出：

000EC0
Found: "./a/b/c/e/1200E9408000EC0.jpeg"

更新:代码审查

针对更新后的问题，这里给出一个稍作改进的测试程序，它既可以使用 boost::filesystem，也可以使用 std::filesystem。

其中包含许多小改进（删除重复代码和显式转换、使用 std::optional 返回可选的匹配结果等）。

还添加了对空白字符的修剪（trim），以去除输入行中多余的空白：

#include <boost/algorithm/string.hpp>
#include <fstream>
#include <iostream>

using boost::algorithm::icontains;
using boost::algorithm::trim;

#if defined(USE_BOOST_FS)
    #include <boost/filesystem.hpp>
    namespace fs = boost::filesystem;
    using boost::system::error_code;
#else
    #include <filesystem>
    namespace fs = std::filesystem;
    using std::error_code;
#endif

// Appends to `ret` the path of every regular file below `root` (searched
// recursively) whose extension compares equal to `ext`.
// Returns without touching `ret` when `root` is missing or not a directory.
void getfiles(
    const fs::path& root, const std::string& ext, std::vector<fs::path>& ret)
{
    bool const scannable = exists(root) && is_directory(root);
    if (!scannable)
        return;

    for (auto const& entry : fs::recursive_directory_iterator(root)) {
        if (!is_regular_file(entry))
            continue;
        if (entry.path().extension() == ext)
            ret.push_back(entry.path());
    }
}

// Returns the path of the first entry below `dir_path` (searched recursively)
// whose file name contains `partial`, ignoring case, or std::nullopt when no
// entry matches. Every visited entry is logged to std::cerr together with
// its match result.
std::optional<fs::path> find_file(const fs::path& dir_path, fs::path partial)
{
    auto const& needle = partial.native();

    for (fs::recursive_directory_iterator cursor(dir_path), last; cursor != last; ++cursor) {
        fs::path const candidate = *cursor;
        bool const matches = icontains(candidate.filename().native(), needle);
        std::cerr << candidate << " Matches: " << std::boolalpha << matches
                  << std::endl;
        if (matches)
            return cursor->path();
    }

    return std::nullopt;
}

// Convenience wrapper around getfiles(): returns every ".txt" file found
// below `textFiles` as a vector of paths.
auto readInputClass(fs::path const& textFiles)
{
    std::vector<fs::path> labelFiles;
    getfiles(textFiles, ".txt", labelFiles);
    return labelFiles;
}

// Entry point. args[1] is the directory holding the ".txt" label files and
// args[2] is the dataset directory. Each trimmed line of a label file is used
// as a partial file name; the first dataset file matching it is copied into a
// directory named after the label file (path without extension).
// args.at() throws std::out_of_range when an argument is missing.
int main(int argc, char** argv)
{
    std::vector<std::string> const args(argv, argv + argc);
    auto const textFiles = readInputClass(args.at(1));
    std::string const datasetPath = args.at(2);

    for (fs::path const& labelFile : textFiles) {
        // Open the label file before deriving the output directory from it.
        std::ifstream lblFile(labelFile);

        // Output directory: the label file's path with the extension removed.
        fs::path outputDir = labelFile;
        outputDir.replace_extension();
        if (!fs::exists(outputDir) && fs::create_directories(outputDir))
            std::cerr << outputDir << " created" << std::endl;

        std::string line;
        while (getline(lblFile, line)) {
            trim(line);

            auto found = find_file(datasetPath, line);
            if (!found) {
                std::cerr << "Not Found \n";
                continue;
            }

            auto dest = outputDir / found->filename();

            // Report the copy outcome instead of throwing on failure.
            error_code ec;
            copy(*found, dest, ec);
            std::cerr << dest << " (" << ec.message() << ")\n";
        }
    }
}

使用

从头开始测试
mkdir -pv textfiles dataset
touch dataset/{vfv343434,fsdfdsfdfsf,1200E9408000EC0}.jpeg
echo 'E9408000EC0 ' > textfiles/490.txt

运行

./a.out textfiles/ dataset/

输出：

"textfiles/490" created
"dataset/1200E9408000EC0.jpeg" Matches: true
"textfiles/490/1200E9408000EC0.jpeg" (Success)

或者在之后再次运行时：

"dataset/fsdfdsfdfsf.jpeg" Matches: false
"dataset/1200E9408000EC0.jpeg" Matches: true
"textfiles/490/1200E9408000EC0.jpeg" (File exists)

附加内容

进行更多诊断并避免为每个模式重复遍历文件系统。现在的主程序是:

Live On Coliru

int main(int argc, char** argv)
{
    std::vector<std::string> const args(argv, argv + argc);

    Paths const classes = getfiles(args.at(1), ".txt");
    Mappings map = readClassMappings(classes);

    std::cout << "Procesing " << map.size() << " patterns from "
              << classes.size() << " classes" << std::endl;

    processDatasetDir(args.at(2), map);
}

其余函数的实现如下：

// Be smart about case-insensitive patterns.
//
// Pattern is a std::string whose ordering ignores letter case, so ordered
// containers keyed on it treat e.g. "abc" and "ABC" as equivalent.
// Only the ordering is customized: operator== is not redefined here, so
// equality still resolves to std::string's case-sensitive comparison.
struct Pattern : std::string {
    using std::string::string;
    using std::string::operator=;

#ifdef __cpp_lib_three_way_comparison
    // Case-insensitive three-way comparison.
    std::weak_ordering operator<=>(Pattern const& other) const {
        if (boost::ilexicographical_compare(*this, other)) {
            return std::weak_ordering::less;
        } else if (boost::ilexicographical_compare(other, *this)) {
            // `other` sorts before `*this`, so `*this` is the greater one.
            return std::weak_ordering::greater;
        }
        return std::weak_ordering::equivalent;
    }
#else
    // Case-insensitive strict weak ordering for pre-C++20 library support.
    bool operator<(Pattern const& other) const {
        return boost::ilexicographical_compare(*this, other);
    }
#endif
};

using Paths    = std::vector<fs::path>;
using Mapping  = std::pair<Pattern, fs::path>;
using Patterns = std::set<Pattern>;
using Mappings = std::set<Mapping>;

// Reads every class label file and builds the set of (pattern, class
// directory) mappings.
//
// Each file in `classes` is read line by line; every line is trimmed and
// paired with the file's path minus its extension. Blank lines are skipped.
// A line producing a mapping that is already present is reported on
// std::cerr and otherwise ignored.
//
// Returns the collected mappings.
Mappings readClassMappings(Paths const& classes)
{
    Mappings mappings;
    for (fs::path classname : classes) {
        std::ifstream lblFile(classname);
        classname.replace_extension();

        for (Pattern pattern; getline(lblFile, pattern);) {
            trim(pattern);

            // An empty needle is contained in every file name, so a blank
            // line would silently map the whole dataset to this class.
            if (pattern.empty())
                continue;

            if (auto [it, ok] = mappings.emplace(pattern, classname); !ok) {
                std::cerr << "WARNING: " << std::quoted(pattern)
                          << " duplicates " << std::quoted(it->first)
                          << std::endl;
            }
        }
    }

    return mappings;
}

// Walks `datasetPath` recursively once and, for every regular file, copies it
// into the class directory of each mapping whose pattern occurs in the file
// name (case-insensitively). Class directories are created on demand.
//
// Each copy attempt is logged to std::cerr with its error message; a summary
// line (copied / missing / failed) is written to std::cout at the end.
// "missing" is the number of mappings that matched no file at all.
//
// Returns the number of successful copies.
size_t processDatasetDir(const fs::path& datasetPath, Mappings const& patterns)
{
    size_t copied = 0, failed = 0;

    // Mappings that matched at least one file. Tracking the mappings
    // themselves (rather than bare patterns) keeps the "missing" count
    // correct when the same pattern is listed for several classes.
    std::set<Mapping const*> matched;

    using It = fs::recursive_directory_iterator;
    for (It it = It(datasetPath), end; it != end; ++it) {
        if (!it->is_regular_file())
            continue;

        fs::path const& entry = *it;
        // Computed once per file instead of once per pattern.
        fs::path const filename = entry.filename();

        for (auto const& mapping : patterns) {
            auto const& [pattern, location] = mapping;
            if (!icontains(filename.native(), pattern))
                continue;

            matched.insert(&mapping);

            if (!exists(location) && fs::create_directories(location))
                std::cerr << location << " created" << std::endl;

            auto dest = location / filename;

            // Use the error_code overload so one failed copy does not abort
            // the whole run.
            error_code ec;
            copy(entry, dest, ec);
            std::cerr << dest << " (" << ec.message() << ") from "
                      << std::quoted(pattern) << "\n";

            (ec? failed : copied) += 1;
        }
    }

    std::cout << "Copied:" << copied
              << ", missing:" << patterns.size() - matched.size()
              << ", failed: " << failed << std::endl;
    return copied;
}

加上一些更“随机”的测试数据:

mkdir -pv textfiles dataset
touch dataset/{vfv343434,fsdfdsfdfsf,1200E9408000EC0}.jpeg
echo .jPeg > textfiles/all_of_them.txt
echo $'E9408000EC0 \n e9408000ec0\nE9408\nbOgUs' > textfiles/490.txt

运行方式为：

./a.out textfiles/ dataset/

打印:

WARNING: "e9408000ec0" duplicates "E9408000EC0"
Procesing 4 patterns from 2 classes
"textfiles/all_of_them" created
"textfiles/all_of_them/1200E9408000EC0.jpeg" (Success) from ".jPeg"
"textfiles/490" created
"textfiles/490/1200E9408000EC0.jpeg" (Success) from "E9408"
"textfiles/490/1200E9408000EC0.jpeg" (File exists) from "E9408000EC0"
"textfiles/all_of_them/vfv343434.jpeg" (Success) from ".jPeg"
"textfiles/all_of_them/fsdfdsfdfsf.jpeg" (Success) from ".jPeg"
Copied:4, missing:1, failed: 1