Calculating MD5 hashes of multiple files
My goal is to rename all the files in a directory to their MD5 hashes, to make checking for duplicates easier.
I currently have about 30,000 files to process. However, after testing the code on small batches to make sure it worked, I ran into this error:
Error: EMFILE: too many open files ...
Yes, I have already tried researching this problem and multiple other similar questions, such as node and Error: EMFILE, too many open files.
I think it is related to how I open the files and to the asynchronous operations, but I don't know how to write this correctly. With nothing throttling the opens, every file in the tree ends up open at roughly the same time, which easily exceeds the per-process descriptor limit (often only 1024 by default; see ulimit -n on Linux/macOS).
Here is my initial attempt at solving the problem:
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');

const md5hashtable = [];
let countProcess = 0;
let countOpen = 0;
let countRead = 0;
let countClose = 0;

async function processFilesMD5(routePath) {
    // Get files/folders in path
    await fs.readdirSync(routePath).forEach((file) => {
        const filepath = path.join(routePath, file);
        // Check if entry is a directory, to do the tree walk
        fs.stat(filepath, async (err, stat) => {
            if (stat.isDirectory()) {
                await processFilesMD5(filepath);
            // Calculate md5 of file
            } else {
                let filename = path.basename(filepath).replace(path.extname(filepath), "")
                if (RegExp('^[a-f0-9]{32}$', 'gm').test(filename)){
                    if (md5hashtable.includes(filename)){
                        console.log(`\nFound dup: ${filename} loc: ${filepath}\n`)
                        fs.unlinkSync(filepath)
                    } else {
                        if (!(path.basename(filepath) === `${filename}${path.extname(filepath)}`)){
                            fs.renameSync(filepath, `${filepath.replace(path.basename(filepath), "")}${filename}${path.extname(filepath)}`)
                        }
                        md5hashtable.push(filename)
                    }
                    countProcess++;
                } else {
                    countProcess++;
                    countOpen++;
                    let hash = crypto.createHash('md5')
                    let stream = fs.createReadStream(filepath)
                    console.log(`Created Stream with ID: ${countOpen}`)
                    await stream.on('data', function (data) {
                        hash.update(data, 'utf8')
                        countRead++;
                        // console.log(`Reading Stream with chunk ID: ${countRead}`)
                    })
                    await stream.on('end', function () {
                        countClose++;
                        // console.log(`Closing Stream with ID: ${countClose}`)
                        const md5name = hash.digest('hex')
                        if (md5hashtable.includes(md5name)){
                            console.log(`\nFound dup: ${md5name} loc: ${filepath}\n`)
                            fs.unlinkSync(filepath)
                        } else {
                            if (!(path.basename(filepath) === `${md5name}${path.extname(filepath)}`)){
                                fs.renameSync(filepath, `${filepath.replace(path.basename(filepath), "")}${md5name}${path.extname(filepath)}`)
                            }
                            md5hashtable.push(md5name)
                        }
                        console.log(`File: ${filepath} has hash: ${md5name}`)
                        stream.destroy()
                    })
                }
            }
        });
    });
    console.log(`Current Route: ${routePath}\nTotal files processed: ${countProcess}\nFiles Opened: ${countOpen}\nChunks Read: ${countRead}\nFiles Closed: ${countClose}`)
}

processFilesMD5(`${path.join(__dirname, 'media')}`).then(() => {
    console.log('Done')
})
Here is my second attempt at the problem, which I also cleaned up a bit for simplicity:
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');

const md5hashtable = [];

function calculateMD5(filepath) {
    let hash = crypto.createHash('md5')
    let stream = fs.createReadStream(filepath)
    console.log(`Created Stream`)
    stream.on('data', function (data) {
        hash.update(data, 'utf8')
        console.log(`Reading Stream`)
    })
    stream.on('end', function () {
        const MD5hash = hash.digest('hex')
        if (dupHashCheck(MD5hash)){ // Hash already exists
            console.log(`\nFound dup: ${MD5hash} loc: ${filepath}\n`)
            fs.unlink(filepath, (err) => { if (err) throw err }) // Deletes duplicate
        } else { // Hash does not exist
            md5hashtable.push(MD5hash)
        }
        console.log(`File: ${filepath}\nHash: ${MD5hash}\n`)
        stream.destroy()
        console.log(`Closing Stream`)
    })
}

function validateMD5(hash){
    return RegExp('^[a-f0-9]{32}$', 'gm').test(hash);
}

function dupHashCheck(hash){
    return md5hashtable.includes(hash)
}

function processImageRoute(routePath) {
    fs.readdir(routePath, (err, files) => { // Get files in path
        files.forEach(file => {
            let filepath = path.join(routePath, file); // Join root dir with path of folder
            fs.stat(filepath, async (err, stat) => { // Get stats of dir
                if (stat.isDirectory()) { // If dir is a folder, recurse
                    processImageRoute(filepath);
                } else { // Continue
                    let filename = path.basename(filepath).replace(path.extname(filepath), "") // Get filename without extension
                    if (validateMD5(filename)){ // Filename is a valid md5 hash
                        if (dupHashCheck(filename)){ // Hash already exists
                            console.log(`\nFound dup: ${filename} loc: ${filepath}\n`)
                            fs.unlink(filepath, (err) => { if (err) throw err }) // Deletes duplicate
                        } else { // Hash does not exist
                            md5hashtable.push(filename)
                        }
                    } else { // Isn't a valid md5 hash
                        calculateMD5(filepath)
                    }
                }
            })
        })
    })
}

processImageRoute(`${path.join(__dirname, 'media')}`)
Neither version works, because both open too many files at once, yet on small batches they run perfectly. Also, this is my first question, so I'm open to any suggestions and comments.
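To make the failure mode concrete: nothing throttles the loop, so files.forEach fires fs.stat for every entry at once, and each callback immediately opens a read stream without waiting for any earlier one to finish. A stripped-down sketch (hypothetical, for illustration only) of what both attempts effectively do:
const fs = require('fs');
const path = require('path');

// Stripped-down failure mode: readdir returns every entry, forEach runs
// synchronously, and each iteration opens a stream without waiting for any
// previous one to close. ~30,000 files means ~30,000 open descriptors.
fs.readdir(path.join(__dirname, 'media'), (err, files) => {
    if (err) throw err;
    files.forEach((file) => {
        fs.createReadStream(path.join(__dirname, 'media', file))
            .on('error', console.error) // EMFILE surfaces here once the limit is hit
            .resume(); // drain the stream; it only closes much later
    });
});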
Following @codeness93's suggestion to promisify the code, I made this:
const path = require('path');
const crypto = require('crypto');
global.fs = require('fs-extra');

const md5hashtable = [];

function calculateMD5(filePath) {
    return new Promise((resolve, reject) => {
        let hash = crypto.createHash('md5')
        let stream = fs.createReadStream(filePath)
        stream.on('error', function (err) {
            reject(err);
        })
        stream.on('data', function (data) {
            hash.update(data, 'utf8')
        })
        stream.on('end', function () {
            stream.close();
            resolve(hash.digest('hex'));
        })
    });
}

function validateMD5(hash){
    return RegExp('^[a-f0-9]{32}$', 'gm').test(hash);
}

function dupHashCheck(hash){
    return md5hashtable.includes(hash)
}

function renameFile(filePath, fileHash){
    try {
        fs.renameSync(filePath, `${filePath.replace(path.basename(filePath), "")}${fileHash}${path.extname(filePath)}`)
    } catch (e){
        throw new Error(e)
    }
}

function processImageRoute(routePath) {
    fs.readdir(routePath, (err, files) => { // Get files in path
        files.forEach(file => {
            let filePath = path.join(routePath, file); // Join root dir with path of folder
            fs.stat(filePath, async (err, stat) => { // Get stats of dir
                if (stat.isDirectory()) { // If dir is a folder, recurse
                    processImageRoute(filePath);
                } else { // Continue
                    let fileName = path.basename(filePath).replace(path.extname(filePath), "") // Get fileName without extension
                    if (validateMD5(fileName)){ // fileName is a valid md5 hash
                        if (dupHashCheck(fileName)){ // Hash already exists
                            console.log(`\nFound dup: ${fileName} loc: ${filePath}\n`)
                            fs.unlink(filePath) // Deletes duplicate (with fs-extra this returns a promise)
                        } else { // Hash does not exist
                            md5hashtable.push(fileName)
                        }
                    } else { // Isn't a valid md5 hash
                        await calculateMD5(filePath).then(function(fileHash){
                            if (validateMD5(fileHash)){
                                if (dupHashCheck(fileHash)){ // Hash already exists
                                    console.log(`\nFound dup: ${fileName} loc: ${filePath}\n`)
                                    fs.unlink(filePath) // Deletes duplicate
                                } else { // Hash does not exist
                                    renameFile(filePath, fileHash); // Renames the file to its hash plus extension
                                    md5hashtable.push(fileHash)
                                }
                                console.log(`File: ${filePath}\nHash: ${fileHash}\n`)
                            } else {
                                throw new Error(`Unable to calculate hash for file: ${fileName}\nError: ${fileHash}\n`)
                            }
                        })
                    }
                }
            })
        })
    })
}

processImageRoute(`${path.join(__dirname, 'media')}`)
I'm not sure whether adding the promises introduced enough delay between opening the streams, reading them, and then closing them, or whether replacing fs with fs-extra is what made it work, or both, or magic dust, but it works.
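My best guess at the non-magic explanation: fs-extra is built on graceful-fs, which queues up open() and readdir() calls and retries them once other files close whenever EMFILE is raised. If that's right, the same protection should also work as a one-line drop-in without the rest of fs-extra (untested sketch, assuming graceful-fs is installed):
// graceful-fs mirrors the core fs API but queues open()/readdir() calls
// and retries them as descriptors free up, instead of failing with EMFILE.
const fs = require('graceful-fs');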
In the end it was able to process all 29,088 files, totaling 400 GB, so I'm calling it a success. Feel free to use it, or to suggest improvements.
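That said, the sturdier fix is probably not to open everything at once in the first place. Here is a minimal sketch of a strictly sequential walk (untested, using Node's built-in fs.promises) that never holds more than one read stream open, at the cost of no parallelism:
const path = require('path');
const crypto = require('crypto');
const fsp = require('fs').promises;
const { createReadStream } = require('fs');

// Hash one file, resolving with the hex digest once the stream ends.
function md5OfFile(filePath) {
    return new Promise((resolve, reject) => {
        const hash = crypto.createHash('md5');
        createReadStream(filePath)
            .on('error', reject)
            .on('data', (chunk) => hash.update(chunk))
            .on('end', () => resolve(hash.digest('hex')));
    });
}

// Walk the tree strictly sequentially: the awaits inside the for loop
// guarantee at most one file is open at any moment.
async function walk(dir) {
    for (const entry of await fsp.readdir(dir)) {
        const filePath = path.join(dir, entry);
        if ((await fsp.stat(filePath)).isDirectory()) {
            await walk(filePath);
        } else {
            console.log(`${await md5OfFile(filePath)}  ${filePath}`);
        }
    }
}

walk(path.join(__dirname, 'media')).then(() => console.log('Done'));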