如何组合两个都需要监听端口的 Express 模块？

Question

我正在尝试创建一个网络抓取工具：用户在表单中输入 URL，点击提交后，抓取工具获取该 URL，然后返回我所指定的有关该 URL 的数据。

我的主要 app.js 文件是：

// Dependencies
var express = require('express');
var path = require('path');
var fs = require('fs');

// Custom Libraries - ./ signals to node not to look in the node_modules directory
var scraper = require('./scraper');

// Express application instance and the directories it works with
var app = express();
var staticDir = __dirname + '/app/public';
var viewsPath = path.join(__dirname, '/app/views');

// Serve everything under app/public as static assets
app.use(express.static(staticDir));

// Port setting: the PORT environment variable when present, otherwise 3000
var configuredPort = process.env.PORT;
app.set('port', configuredPort ? configuredPort : 3000);

// Form handling: parse URL-encoded form bodies into request.body
app.use(require('body-parser').urlencoded({ extended: true }));

// NOTE(review): called with a single argument, app.get() is a settings lookup
// in Express and registers no route - confirm whether a handler for
// /the_test was intended here.
app.get('/the_test');

// Writes the domain entered in the form to app/data/domain.txt and only then
// redirects to /results, so the next page never reads a stale or missing file.
// Responds 400 when no usable domain was submitted and 500 when the write fails.
app.post('/process', function(request, response){
    var domain = request.body.domain;

    // With the extended parser the field may be absent, an array or an object
    if (typeof domain !== 'string' || domain.trim() === '') {
        return response.status(400).send('A domain is required.');
    }

    fs.writeFile('app/data/domain.txt', domain.trim(), function (err) {
        if (err) {
            console.log(err);
            return response.status(500).send('Could not save the domain.');
        }

        console.log('Your domain has been saved!');
        // Redirect only after the file is on disk
        response.redirect(303, '/results');
    });
});

// Mount the shared router on both the site root and /results
var routes = require('./routes');
['/', '/results'].forEach(function (mountPath) {
    app.use(mountPath, routes);
});

// Start listening on the configured port and announce the address
function announceStart() {
    console.log('Express started on http://localhost:' + app.get('port') + '; press Ctrl-C to terminate.');
}
app.listen(app.get('port'), announceStart);

我的抓取文件是：

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var scraper = express();

// GET /scrape
// Reads the URL stored in app/data/domain.txt, fetches that page, extracts the
// text of the first child of each .hero-message element (the last match wins)
// and writes the result to app/data/results.json.
// Responds 'Check your console!' on success, 502 when the page cannot be
// fetched and 500 when a file cannot be read or written.
scraper.get('/scrape', function(req, res){
  // Non-blocking read; a missing file becomes an error response, not a throw
  fs.readFile('./app/data/domain.txt', 'utf8', function(readErr, contents){
    if (readErr) {
      console.log(readErr);
      return res.status(500).send('Could not read the saved domain.');
    }

    // Drop stray whitespace/newlines around the stored URL
    var url = contents.trim();

    request(url, function(error, response, html){
      if (error) {
        console.log(error);
        return res.status(502).send('Could not fetch the requested page.');
      }

      var $ = cheerio.load(html);
      var json = { header : "" };

      // Visit every match; later matches overwrite earlier ones
      $('.hero-message').each(function(){
        json.header = $(this).children().first().text();
      });

      fs.writeFile('./app/data/results.json', JSON.stringify(json, null, 4), function(writeErr){
        if (writeErr) {
          console.log(writeErr);
          return res.status(500).send('Could not save the results.');
        }

        console.log('File successfully written! - Check app/data/results.json for the output');
        // Reply only once the results are on disk
        res.send('Check your console!');
      });
    });
  });
});

// Requiring this module starts a second server on port 4000
scraper.listen(4000);
console.log('Magic happens on port 4000');
exports = module.exports = scraper;

当我访问 localhost:3000 时，用户可以输入 URL 并点击提交，随后会被重定向到 localhost:3000/results，同时该 URL 会被写入 app/data/domain.txt。

当我去 localhost:4000/scrape 时，scraper 激活，从 domain.txt 抓取域并抓取它。

我的问题是：如何让这个程序流畅地运行，和/或如何自动激活抓取工具，而不必每次都访问 localhost:4000/scrape？我是 Node.js 和 Express 的新手，我知道这里有很多难看的代码。

如有任何提示，我们将不胜感激。

Answer 1

没有必要为您尝试做的事情保留两个单独的进程。你可以做的是移动 scraper 动作

// The /scrape route registered on the scraper app; handler body elided here.
scraper.get("/scrape", function (req, res) {
    // code
});

到主 app.js 文件并提供来自端口 3000 的所有内容，确保在主文件中包含来自 scraper 的所有依赖项。此时，您可能想了解如何使用 node's module system 来保持代码的分离和组织。

根据您的抓取过程需要多长时间，您可以执行以下操作之一：

更改 process 操作，让它完成 scrape 操作目前所做的工作：这样就不必先把域名写入文件、再转到另一个 URL 从该文件中读取并启动抓取，而是在捕获域名后立即将其交给抓取工具。
如果 scraper 需要很长时间，而您又想自动启动抓取作业，那么您不会希望它阻塞您的应用程序或在请求期间抛出超时。您应该考虑实现工作队列机制。有很多方法可以做到这一点，正确的解决方案在很大程度上取决于应用程序的预期用例。

如何组合两个都需要监听端口的 Express 模块？

How do I combine two Express modules that both require listening to the port?

node.js

express

cheerio