Why would adding MySQL code to an epoll based web crawler slow it down so much?
I have working code that uses libcurl and epoll to crawl web sites:
https://github.com/JamesRead5737/libcurlmemoryleak/blob/master/crawler.c
Typical output is:
Parsed sites: 0, 1024 parallel connections, 10989 still running, 10989 transfers
Exiting normally.
Parsed sites: 0, 0 parallel connections, 0 still running, 0 transfers
Finished all in progress downloads.
Exiting.
As you can see, the code reaches the hard-coded limit of 1024 parallel connections, and it consumes up to 3 Gbps on my dedicated server.
After adding MySQL code like this:
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/timerfd.h>
#include <sys/types.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <time.h>
#include <unistd.h>
#include <pthread.h>
#include <netdb.h>
#include <time.h>
#include <netinet/in.h>
#include <mysql.h>
#include <curl/curl.h>
#define MSG_OUT stdout
#define DEFAULT_QUEUE_LENGTH 10000
#define mycase(code) \
case code: s = __STRING(code)
#define MAX_CONNECTIONS 1024
MYSQL *mysql_con;
/* Global information, common to all connections */
typedef struct _GlobalInfo
{
int epfd; /* epoll filedescriptor */
int tfd; /* timer filedescriptor */
CURLM *multi;
int still_running;
pthread_mutex_t lock;
int concurrent_connections;
pthread_mutex_t parsed_lock;
int parsed_sites;
int transfers;
} GlobalInfo;
int new_body_conn(char *url, GlobalInfo *g);
/* Information associated with a specific easy handle */
typedef struct _ConnInfo
{
CURL *easy;
char *url;
GlobalInfo *global;
char error[CURL_ERROR_SIZE];
size_t size;
char *data;
} ConnInfo;
/* Information associated with a specific socket */
typedef struct _SockInfo
{
curl_socket_t sockfd;
CURL *easy;
int action;
long timeout;
GlobalInfo *global;
} SockInfo;
void
mysql_stop()
{
mysql_close(mysql_con);
}
void
mysql_start()
{
mysql_con = mysql_init(NULL);
if (mysql_con == NULL)
{
fprintf(stderr, "%s\n", mysql_error(mysql_con));
exit(1);
}
if (mysql_real_connect(mysql_con, "localhost", "crawler", "password", "crawl", 0, NULL, 0) == NULL)
{
fprintf(stderr, "%s\n", mysql_error(mysql_con));
exit(EXIT_FAILURE);
}
if (mysql_query(mysql_con, "CREATE TABLE IF NOT EXISTS `frontier` (`url` varchar(2084) NOT NULL, `id` int NOT NULL AUTO_INCREMENT, PRIMARY KEY (`id`), UNIQUE KEY `url` (`url`), KEY `url_2` (`url`)) ENGINE=InnoDB AUTO_INCREMENT=1"))
{
fprintf(stderr, "%s\n", mysql_error(mysql_con));
mysql_stop();
exit(1);
}
if (mysql_query(mysql_con, "CREATE TABLE IF NOT EXISTS `crawled` (`id` int NOT NULL AUTO_INCREMENT, `url` varchar(2084) DEFAULT NULL, `title` varchar(768) DEFAULT NULL, `date` varchar(128) DEFAULT NULL, `last_modified` varchar(128) DEFAULT NULL, `links` int DEFAULT NULL, `backlinks` int DEFAULT NULL, `http_code` int DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `url` (`url`), KEY `http_code` (`http_code`), KEY `title` (`title`)) ENGINE=InnoDB AUTO_INCREMENT=1"))
{
fprintf(stderr, "%s\n", mysql_error(mysql_con));
mysql_stop();
exit(1);
}
if (mysql_query(mysql_con, "CREATE TABLE IF NOT EXISTS `emails` (`email` varchar(2084) NOT NULL, `id` int NOT NULL AUTO_INCREMENT, PRIMARY KEY (`id`), UNIQUE KEY `email` (`email`), KEY `email_2` (`email`)) ENGINE=InnoDB AUTO_INCREMENT=737 DEFAULT CHARSET=latin1"))
{
fprintf(stderr, "%s\n", mysql_error(mysql_con));
mysql_stop();
exit(1);
}
}
void
mysql_url_visited_push(char *url, char *title)
{
char sql[8192];
char escaped_url[(strlen(url)*2)+1];
if (!mysql_real_escape_string(mysql_con, escaped_url, url, strlen(url)))
{
}
if (title != NULL)
{
char escaped_title[(strlen(title)*2)+1];
if (!mysql_real_escape_string(mysql_con, escaped_title, title, strlen(title)))
{
}
sprintf(sql, "INSERT INTO crawled (url, title) VALUES ('%s', '%s')", escaped_url, escaped_title);
} else {
sprintf(sql, "INSERT INTO crawled (url, title) VALUES ('%s', '%s')", escaped_url, title);
}
if (mysql_query(mysql_con, sql))
{
fprintf(stderr, "mysql_url_visited_push sql=%s %s\n", sql, mysql_error(mysql_con));
}
}
int
mysql_url_visited_find(char *url)
{
char sql[8192];
char escaped_url[(strlen(url)*2)+1];
if (!mysql_real_escape_string(mysql_con, escaped_url, url, strlen(url)))
{
}
sprintf(sql, "SELECT * FROM crawled WHERE url = '%s'", escaped_url);
if (mysql_query(mysql_con, sql))
{
fprintf(stderr, "mysql_url_visited_find sql=%s %s\n", sql, mysql_error(mysql_con));
} else {
MYSQL_RES *result = mysql_store_result(mysql_con);
if (!result)
{
fprintf(stderr, "%s\n", mysql_error(mysql_con));
} else if (mysql_num_rows(result) > 0) { /* was (result > 0): compared the result pointer, not the row count */
mysql_free_result(result);
return 1;
} else {
mysql_free_result(result);
return 0;
}
}
return 0; /* query or store failed; treat as not found */
}
int
mysql_url_frontier_find(char *url)
{
char sql[8192];
char escaped_url[(strlen(url)*2)+1];
if (!mysql_real_escape_string(mysql_con, escaped_url, url, strlen(url)))
{
}
sprintf(sql, "SELECT * FROM frontier WHERE url = '%s'", escaped_url);
if (mysql_query(mysql_con, sql))
{
fprintf(stderr, "mysql_url_frontier_find sql=%s %s\n", sql, mysql_error(mysql_con));
} else {
MYSQL_RES *result = mysql_store_result(mysql_con);
if (!result)
{
fprintf(stderr, "%s\n", mysql_error(mysql_con));
} else if (mysql_num_rows(result) > 0) { /* was (result > 0): compared the result pointer, not the row count */
mysql_free_result(result);
return 1;
} else {
mysql_free_result(result);
return 0;
}
}
return 0; /* query or store failed; treat as not found */
}
void
mysql_url_frontier_push(char *url)
{
char sql[8192];
char escaped_url[(strlen(url)*2)+1];
if (!mysql_real_escape_string(mysql_con, escaped_url, url, strlen(url)))
{
}
sprintf(sql, "INSERT IGNORE INTO frontier (url) VALUES ('%s')", escaped_url);
if (mysql_query(mysql_con, sql))
{
fprintf(stderr, "mysql_url_frontier_push sql=%s %s\n", sql, mysql_error(mysql_con));
}
}
char *
mysql_url_frontier_pop()
{
char *url = NULL;
char sql[8192];
if (mysql_query(mysql_con, "SELECT url FROM frontier ORDER BY id") == 0)
{
MYSQL_ROW row;
MYSQL_RES *result = mysql_store_result(mysql_con);
if (result == NULL)
{
fprintf(stderr, "mysql_url_frontier_pop mysql_store_result sql=%s %s\n", sql, mysql_error(mysql_con));
exit(EXIT_FAILURE);
}
if ((row = mysql_fetch_row(result)))
{
url = strdup(row[0]);
char escaped_url[(strlen(url)*2)+1];
if (!mysql_real_escape_string(mysql_con, escaped_url, url, strlen(url)))
{
}
sprintf( sql, "DELETE FROM frontier WHERE url = '%s'", escaped_url);
if (mysql_query(mysql_con, sql))
{
fprintf(stderr, "mysql_url_frontier_pop mysql_query sql=%s %s\n", sql, mysql_error(mysql_con));
exit(EXIT_FAILURE);
}
}
mysql_free_result(result);
}
return url;
}
int
starts_with(const char *str, const char *pre)
{
size_t lenstr;
size_t lenpre;
if (str == NULL || pre == NULL)
return (-1);
lenstr = strlen(str);
lenpre = strlen(pre);
if (lenstr < lenpre)
return (-1);
return (memcmp(pre, str, lenpre));
}
char *
url_sanitize(char *base_url, char *url, int size)
{
char *newurl;
int base_url_len = strlen(base_url);
if (starts_with(url, "http") == 0) {
newurl = malloc(size+1);
if (newurl == NULL) {
fprintf(stderr, "1 malloc() of %d bytes, failed\n", size);
exit(1);
}
strncpy(newurl, url, size);
newurl[size] = '\0';
} else {
if (starts_with(url, "//") == 0) {
newurl = malloc(size+7);
if (newurl == NULL) {
fprintf(stderr, "2 malloc() of %d bytes, failed\n", size);
exit(1);
}
strncpy(newurl, "https:", 6);
strncpy(newurl+6, url, size);
newurl[size+6] = '\0';
} else {
newurl = malloc(base_url_len + size + 2);
if (newurl == NULL) {
fprintf(stderr, "3 malloc() of %d bytes, failed\n", size);
exit(1);
}
strncpy(newurl, base_url, base_url_len);
strncpy(newurl + base_url_len, url, size);
newurl[size + base_url_len] = '\0';
}
}
return (newurl);
}
char *
html_title_find(char *html)
{
char *newurl, *first, *last;
int size = 0;
first = strstr(html, "<title>");
if (first == NULL)
return (NULL);
first += strlen("<title>");
last = strstr(first, "</title>");
if (last == NULL)
return (NULL);
size = last - first;
newurl = malloc(size+1);
if (newurl == NULL) {
fprintf(stderr, "4 malloc() of %d bytes, failed\n", size);
exit(1);
}
strncpy(newurl, first, size);
newurl[size] = '\0';
return (newurl);
}
void
html_link_find(char *url, char *html)
{
char *first, *last, *newurl;
int size = 0;
first = html;
last = html; /* fix: 'last' was read uninitialized in the loop condition below */
while (first && last) {
first = strstr(first, "href=\"");
if (first == NULL)
continue;
first += strlen("href=\"");
last = strchr(first, '\"');
if (last == NULL)
continue;
size = last - first;
newurl = url_sanitize(url, first, size);
if (strstr(newurl, "mailto")) {
free(newurl);
continue;
} else {
if (!mysql_url_visited_find(newurl) && !mysql_url_frontier_find(newurl)) { /* queue only URLs not yet crawled and not already queued */
mysql_url_frontier_push(newurl);
}
free(newurl);
}
}
}
void
parsed_sites_inc(GlobalInfo *g)
{
g->parsed_sites++;
}
void
html_parse(char *url, char *html)
{
char *title;
title = html_title_find(html);
html_link_find(url, html);
mysql_url_visited_push(url, title);
free(title);
}
/* Die if we get a bad CURLMcode somewhere */
static void
mcode_or_die(const char *where, CURLMcode code)
{
if (CURLM_OK != code) {
const char *s;
switch (code) {
mycase(CURLM_BAD_HANDLE); break;
mycase(CURLM_BAD_EASY_HANDLE); break;
mycase(CURLM_OUT_OF_MEMORY); break;
mycase(CURLM_INTERNAL_ERROR); break;
mycase(CURLM_UNKNOWN_OPTION); break;
mycase(CURLM_LAST); break;
default: s = "CURLM_unknown"; break;
mycase(CURLM_BAD_SOCKET);
fprintf(MSG_OUT, "ERROR: %s returns %s\n", where, s);
/* ignore this error */
return;
}
fprintf(MSG_OUT, "ERROR: %s returns %s\n", where, s);
exit(code);
}
}
void
print_progress(GlobalInfo *g)
{
printf("\rParsed sites: %d, %d parallel connections, %d still running, %d transfers\t",
g->parsed_sites, g->concurrent_connections, g->still_running, g->transfers);
fflush(stdout);
}
void
transfers_inc(GlobalInfo *g)
{
g->transfers++;
print_progress(g);
}
void
transfers_dec(GlobalInfo *g)
{
g->transfers--;
print_progress(g);
}
void
concurrent_connections_inc(GlobalInfo *g)
{
g->concurrent_connections++;
print_progress(g);
}
void
concurrent_connections_dec(GlobalInfo *g)
{
g->concurrent_connections--;
print_progress(g);
}
static void timer_cb(GlobalInfo* g, int revents);
/* Update the timer after the curl_multi library does its thing. Curl will
* inform us through this callback what it wants the new timeout to be,
* after it does some work. */
static int
multi_timer_cb(CURLM *multi, long timeout_ms, GlobalInfo *g)
{
struct itimerspec its;
//fprintf(MSG_OUT, "multi_timer_cb: Setting timeout to %ld ms\n", timeout_ms);
if (timeout_ms > 0) {
its.it_interval.tv_sec = 1;
its.it_interval.tv_nsec = 0;
its.it_value.tv_sec = timeout_ms / 1000;
its.it_value.tv_nsec = (timeout_ms % 1000) * 1000 * 1000;
} else if(timeout_ms == 0) {
/* libcurl wants us to timeout now, however setting both fields of
* new_value.it_value to zero disarms the timer. The closest we can
* do is to schedule the timer to fire in 1 ns. */
its.it_interval.tv_sec = 1;
its.it_interval.tv_nsec = 0;
its.it_value.tv_sec = 0;
its.it_value.tv_nsec = 1;
} else {
memset(&its, 0, sizeof(struct itimerspec));
}
timerfd_settime(g->tfd, /*flags=*/ 0, &its, NULL);
return (0);
}
/* Check for completed transfers, and remove their easy handles */
static void
check_multi_info(GlobalInfo *g)
{
char *eff_url;
CURLMsg *msg;
int msgs_left;
ConnInfo *conn;
CURL *easy;
char *ct;
double time;
double dl;
long header_size;
long response_code;
//CURLcode res;
while ((msg = curl_multi_info_read(g->multi, &msgs_left))) {
if (msg->msg == CURLMSG_DONE) {
easy = msg->easy_handle;
//res = msg->data.result;
curl_easy_getinfo(easy, CURLINFO_PRIVATE, &conn);
curl_easy_getinfo(easy, CURLINFO_EFFECTIVE_URL, &eff_url);
curl_easy_getinfo(easy, CURLINFO_CONTENT_TYPE, &ct);
curl_easy_getinfo(easy, CURLINFO_TOTAL_TIME, &time);
curl_easy_getinfo(easy, CURLINFO_SIZE_DOWNLOAD, &dl);
curl_easy_getinfo(easy, CURLINFO_RESPONSE_CODE, &response_code);
curl_easy_getinfo(easy, CURLINFO_HEADER_SIZE, &header_size);
if (response_code == 200 && dl == 0.0 && (starts_with(ct, "text/html") || starts_with(ct, "text/plain")))
{
/* This should be a response to our HEAD request */
//printf("200 %s header size: %ld download size: %f", eff_url, header_size, dl);
new_body_conn(eff_url, g);
} else if (response_code == 200 && dl > 0.0 && (starts_with(ct, "text/html") || starts_with(ct, "text/plain"))){
/* This should be a response to our GET request */
//printf("%ld %s download size: %f content type: %s\n", response_code, eff_url, dl, ct);
html_parse(eff_url, conn->data);
parsed_sites_inc(g);
}
//fprintf(MSG_OUT, "DONE: %s => (%d) %s\n", eff_url, res, conn->error);
curl_multi_remove_handle(g->multi, easy);
//free(conn->url);
free(conn->data);
curl_easy_cleanup(easy);
transfers_dec(g);
free(conn);
}
}
}
/* Called by the main loop when we get action on a multi socket filedescriptor */
static void
event_cb(GlobalInfo *g, int fd, int revents)
{
CURLMcode rc;
struct itimerspec its;
int action = ((revents & EPOLLIN) ? CURL_CSELECT_IN : 0) |
((revents & EPOLLOUT) ? CURL_CSELECT_OUT : 0);
rc = curl_multi_socket_action(g->multi, fd, action, &g->still_running);
mcode_or_die("event_cb: curl_multi_socket_action", rc);
check_multi_info(g);
if (g->still_running <= 0) {
//fprintf(MSG_OUT, "last transfer done, kill timeout\n");
memset(&its, 0, sizeof(struct itimerspec));
timerfd_settime(g->tfd, 0, &its, NULL);
}
}
/* Called by main loop when our timeout expires */
static void
timer_cb(GlobalInfo* g, int revents)
{
CURLMcode rc;
uint64_t count = 0;
ssize_t err = 0;
err = read(g->tfd, &count, sizeof(uint64_t));
if (err == -1) {
/* Note that we may call the timer callback even if the timerfd isn't
* readable. It's possible that there are multiple events stored in the
* epoll buffer (i.e. the timer may have fired multiple times). The
* event count is cleared after the first call so future events in the
* epoll buffer will fail to read from the timer. */
if (errno == EAGAIN) {
//fprintf(MSG_OUT, "EAGAIN on tfd %d\n", g->tfd);
return;
}
}
if (err != sizeof(uint64_t)) {
fprintf(stderr, "read(tfd) == %ld", err);
perror("read(tfd)");
}
rc = curl_multi_socket_action(g->multi, CURL_SOCKET_TIMEOUT, 0, &g->still_running);
mcode_or_die("timer_cb: curl_multi_socket_action", rc);
check_multi_info(g);
}
/* Assign information to a SockInfo structure */
static void
setsock(SockInfo *f, curl_socket_t s, CURL *e, int act, GlobalInfo *g)
{
struct epoll_event ev;
int kind = ((act & CURL_POLL_IN) ? EPOLLIN : 0) |
((act & CURL_POLL_OUT) ? EPOLLOUT : 0);
if (f->sockfd) {
concurrent_connections_dec(g);
if (epoll_ctl(g->epfd, EPOLL_CTL_DEL, f->sockfd, NULL))
fprintf(stderr, "EPOLL_CTL_DEL failed for fd: %d : %s\n",
f->sockfd, strerror(errno));
}
f->sockfd = s;
f->action = act;
f->easy = e;
ev.events = kind;
ev.data.fd = s;
concurrent_connections_inc(g);
if (epoll_ctl(g->epfd, EPOLL_CTL_ADD, s, &ev)) {
fprintf(stderr, "EPOLL_CTL_ADD failed for fd: %d : %s\n",
s, strerror(errno));
}
}
/* Initialize a new SockInfo structure */
static void
addsock(curl_socket_t s, CURL *easy, int action, GlobalInfo *g)
{
SockInfo *fdp = (SockInfo *)calloc(sizeof(SockInfo), 1);
fdp->global = g;
setsock(fdp, s, easy, action, g);
curl_multi_assign(g->multi, s, fdp);
}
static size_t
write_cb(void *contents, size_t size, size_t nmemb, void *p)
{
ConnInfo *conn = (ConnInfo *)p;
size_t realsize = size * nmemb;
conn->data = realloc(conn->data, conn->size + realsize + 1);
if (conn->data == NULL) {
/* out of memory! */
printf("not enough memory (realloc returned NULL)\n");
return 0;
}
memcpy(&(conn->data[conn->size]), contents, realsize);
conn->size += realsize;
conn->data[conn->size] = 0;
return realsize;
}
/* Create a new easy handle, and add it to the global curl_multi */
int
new_head_conn(char *url, GlobalInfo *g)
{
ConnInfo *conn;
CURLMcode rc;
conn = (ConnInfo*)calloc(1, sizeof(ConnInfo));
conn->error[0]='[=12=]';
conn->global = g;
conn->easy = curl_easy_init();
if (!conn->easy) {
fprintf(MSG_OUT, "curl_easy_init() failed, exiting!\n");
exit(2);
}
transfers_inc(g);
conn->global = g;
conn->url = url;
curl_easy_setopt(conn->easy, CURLOPT_URL, conn->url);
curl_easy_setopt(conn->easy, CURLOPT_WRITEFUNCTION, write_cb);
curl_easy_setopt(conn->easy, CURLOPT_WRITEDATA, conn);
curl_easy_setopt(conn->easy, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(conn->easy, CURLOPT_ERRORBUFFER, conn->error);
curl_easy_setopt(conn->easy, CURLOPT_PRIVATE, conn);
curl_easy_setopt(conn->easy, CURLOPT_NOPROGRESS, 1L);
curl_easy_setopt(conn->easy, CURLOPT_PROGRESSDATA, conn);
curl_easy_setopt(conn->easy, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(conn->easy, CURLOPT_LOW_SPEED_TIME, 3L);
curl_easy_setopt(conn->easy, CURLOPT_LOW_SPEED_LIMIT, 100L);
curl_easy_setopt(conn->easy, CURLOPT_CONNECTTIMEOUT, 10L);
curl_easy_setopt(conn->easy, CURLOPT_CLOSESOCKETDATA, g);
curl_easy_setopt(conn->easy, CURLOPT_NOBODY, 1L);
rc = curl_multi_add_handle(g->multi, conn->easy);
mcode_or_die("new_conn: curl_multi_add_handle", rc);
/* note that the add_handle() will set a time-out to trigger very soon so
that the necessary socket_action() call will be called by this app */
return (0);
}
/* Create a new easy handle, and add it to the global curl_multi */
int
new_body_conn(char *url, GlobalInfo *g)
{
ConnInfo *conn;
CURLMcode rc;
conn = (ConnInfo*)calloc(1, sizeof(ConnInfo));
conn->error[0]='[=12=]';
conn->global = g;
conn->easy = curl_easy_init();
if (!conn->easy) {
fprintf(MSG_OUT, "curl_easy_init() failed, exiting!\n");
exit(2);
}
transfers_inc(g);
conn->global = g;
conn->url = url;
curl_easy_setopt(conn->easy, CURLOPT_URL, conn->url);
curl_easy_setopt(conn->easy, CURLOPT_WRITEFUNCTION, write_cb);
curl_easy_setopt(conn->easy, CURLOPT_WRITEDATA, conn);
curl_easy_setopt(conn->easy, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(conn->easy, CURLOPT_ERRORBUFFER, conn->error);
curl_easy_setopt(conn->easy, CURLOPT_PRIVATE, conn);
curl_easy_setopt(conn->easy, CURLOPT_NOPROGRESS, 1L);
curl_easy_setopt(conn->easy, CURLOPT_PROGRESSDATA, conn);
curl_easy_setopt(conn->easy, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(conn->easy, CURLOPT_LOW_SPEED_TIME, 3L);
curl_easy_setopt(conn->easy, CURLOPT_LOW_SPEED_LIMIT, 100L);
curl_easy_setopt(conn->easy, CURLOPT_CONNECTTIMEOUT, 10L);
curl_easy_setopt(conn->easy, CURLOPT_CLOSESOCKETDATA, g);
rc = curl_multi_add_handle(g->multi, conn->easy);
mcode_or_die("new_conn: curl_multi_add_handle", rc);
/* note that the add_handle() will set a time-out to trigger very soon so
that the necessary socket_action() call will be called by this app */
return (0);
}
/* Clean up the SockInfo structure */
static void
remsock(SockInfo *f, GlobalInfo* g)
{
if (f) {
if (f->sockfd) {
concurrent_connections_dec(g);
if (epoll_ctl(g->epfd, EPOLL_CTL_DEL, f->sockfd, NULL))
fprintf(stderr, "EPOLL_CTL_DEL failed for fd: %d : %s\n",
f->sockfd, strerror(errno));
}
free(f);
}
}
/* CURLMOPT_SOCKETFUNCTION */
static int
sock_cb(CURL *e, curl_socket_t s, int what, void *cbp, void *sockp)
{
GlobalInfo *g = (GlobalInfo*) cbp;
SockInfo *fdp = (SockInfo*) sockp;
if (what == CURL_POLL_REMOVE) {
remsock(fdp, g);
} else {
if (g->concurrent_connections < MAX_CONNECTIONS){
if (!fdp) {
addsock(s, e, what, g);
} else {
setsock(fdp, s, e, what, g);
}
}
}
return (0);
}
/* CURLMOPT_SOCKETFUNCTION */
static int
end_sock_cb(CURL *e, curl_socket_t s, int what, void *cbp, void *sockp)
{
GlobalInfo *g = (GlobalInfo*) cbp;
SockInfo *fdp = (SockInfo*) sockp;
if (what == CURL_POLL_REMOVE) {
remsock(fdp, g);
}
return (0);
}
int should_exit = 0;
void
signal_handler(int signo)
{
should_exit = 1;
}
void *
crawler_init()
{
GlobalInfo g;
struct itimerspec its;
struct epoll_event ev;
struct epoll_event events[10000];
memset(&g, 0, sizeof(GlobalInfo));
g.transfers = 0;
g.parsed_sites = 0;
g.epfd = epoll_create1(EPOLL_CLOEXEC);
if (g.epfd == -1) {
perror("epoll_create1 failed\n");
exit(1);
}
g.tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC);
if (g.tfd == -1) {
perror("timerfd_create failed\n");
exit(1);
}
memset(&its, 0, sizeof(struct itimerspec));
its.it_interval.tv_sec = 1;
its.it_value.tv_sec = 1;
timerfd_settime(g.tfd, 0, &its, NULL);
ev.events = EPOLLIN;
ev.data.fd = g.tfd;
epoll_ctl(g.epfd, EPOLL_CTL_ADD, g.tfd, &ev);
curl_global_init(CURL_GLOBAL_DEFAULT);
g.multi = curl_multi_init();
/* setup the generic multi interface options we want */
curl_multi_setopt(g.multi, CURLMOPT_SOCKETFUNCTION, sock_cb);
curl_multi_setopt(g.multi, CURLMOPT_SOCKETDATA, &g);
curl_multi_setopt(g.multi, CURLMOPT_TIMERFUNCTION, multi_timer_cb);
curl_multi_setopt(g.multi, CURLMOPT_TIMERDATA, &g);
/* we don't call any curl_multi_socket*() function yet as we have no handles added! */
//printf("Starting crawler...\n");
while (!should_exit) {
int idx;
int err = epoll_wait(g.epfd, events, sizeof(events)/sizeof(struct epoll_event), 10000);
char *url;
url = mysql_url_frontier_pop();
new_head_conn(url, &g);
if (err == -1) {
if (errno == EINTR) {
fprintf(MSG_OUT, "note: wait interrupted\n");
continue;
} else {
perror("epoll_wait");
exit(1);
}
}
for (idx = 0; idx < err; ++idx) {
if (events[idx].data.fd == g.tfd) {
timer_cb(&g, events[idx].events);
} else {
event_cb(&g, events[idx].data.fd, events[idx].events);
}
}
}
fprintf(MSG_OUT, "Exiting normally.\n");
fflush(MSG_OUT);
curl_multi_setopt(g.multi, CURLMOPT_SOCKETFUNCTION, end_sock_cb);
while (g.concurrent_connections > 0 || g.transfers > 0)
{
int idx;
int err = epoll_wait(g.epfd, events, sizeof(events)/sizeof(struct epoll_event), 10000);
if (err == -1) {
if (errno == EINTR) {
fprintf(MSG_OUT, "note: wait interrupted\n");
continue;
} else {
perror("epoll_wait");
exit(1);
}
}
for (idx = 0; idx < err; ++idx) {
if (events[idx].data.fd == g.tfd) {
timer_cb(&g, events[idx].events);
} else {
event_cb(&g, events[idx].data.fd, events[idx].events);
}
}
}
fprintf(MSG_OUT, "Finished all in progress downloads.\n");
fflush(MSG_OUT);
curl_multi_cleanup(g.multi);
curl_global_cleanup();
return (NULL);
}
int
main(int argc, char **argv)
{
int cleanup = 0, opt, ret;
should_exit = 0;
signal(SIGINT, signal_handler);
signal(SIGKILL, signal_handler);
mysql_start();
crawler_init();
mysql_stop();
printf("Exiting.\n");
return (0);
}
typical output is:
Parsed sites: 42, 6 parallel connections, 4 still running, 6 transfers
Exiting normally.
Parsed sites: 48, 0 parallel connections, 0 still running, 0 transfers
Finished all in progress downloads.
Exiting.
As you can see, the application comes nowhere near the hard-coded maximum number of parallel connections. Not even close. And it only consumes around 3 Mbps of bandwidth.
Why? I don't understand how merely adding MySQL code stops the program from running at its full potential and reaching the hard-coded maximum number of parallel connections.
Any idea what's wrong with this code?
EDIT
Here is some gprof output:
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls ms/call ms/call name
44.49 0.04 0.04 12668 0.00 0.00 mysql_url_visited_find
33.37 0.07 0.03 12668 0.00 0.00 mysql_url_frontier_push
11.12 0.08 0.01 12668 0.00 0.00 mysql_url_frontier_find
11.12 0.09 0.01 100 0.10 0.90 html_link_find
0.00 0.09 0.00 17355 0.00 0.00 starts_with
0.00 0.09 0.00 12669 0.00 0.00 url_sanitize
0.00 0.09 0.00 2651 0.00 0.00 mcode_or_die
0.00 0.09 0.00 2432 0.00 0.04 check_multi_info
0.00 0.09 0.00 2420 0.00 0.04 event_cb
0.00 0.09 0.00 1288 0.00 0.00 print_progress
0.00 0.09 0.00 425 0.00 0.00 concurrent_connections_dec
0.00 0.09 0.00 425 0.00 0.00 concurrent_connections_inc
0.00 0.09 0.00 425 0.00 0.00 setsock
0.00 0.09 0.00 303 0.00 0.00 remsock
0.00 0.09 0.00 299 0.00 0.00 addsock
0.00 0.09 0.00 219 0.00 0.00 transfers_dec
0.00 0.09 0.00 219 0.00 0.00 transfers_inc
0.00 0.09 0.00 116 0.00 0.00 mysql_url_frontier_pop
0.00 0.09 0.00 116 0.00 0.00 new_head_conn
0.00 0.09 0.00 103 0.00 0.00 new_body_conn
0.00 0.09 0.00 100 0.00 0.90 html_parse
0.00 0.09 0.00 100 0.00 0.00 html_title_find
0.00 0.09 0.00 100 0.00 0.00 mysql_url_visited_push
0.00 0.09 0.00 100 0.00 0.00 parsed_sites_inc
0.00 0.09 0.00 32 0.00 0.01 timer_cb
0.00 0.09 0.00 1 0.00 90.09 crawler_init
0.00 0.09 0.00 1 0.00 0.00 mysql_start
0.00 0.09 0.00 1 0.00 0.00 mysql_stop
All of your MySQL code is synchronous: while you are waiting for MySQL to answer a query or acknowledge an insert, your event loop is doing nothing else, and you are spending 90% of your time in MySQL-related functions. You never get anywhere near the concurrent-connection limit because the database bottleneck limits the rate at which you can issue new HTTP requests, and in-flight transfers sit idle while you wait on the DB.
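As an illustration of what "asynchronous" could look like here, the MariaDB client library ships a non-blocking variant of each call (enabled with MYSQL_OPT_NONBLOCK; mysql_real_query_start()/mysql_real_query_cont() plus mysql_get_socket()). Below is a minimal sketch, assuming libmariadb and the crawler's existing epoll fd; the DbOp type and function names are illustrative, error handling is elided, and a real version would have to distinguish MySQL sockets from curl sockets in the event loop (the crawler keys on ev.data.fd, this sketch uses ev.data.ptr):
/* Sketch: drive one MySQL query from the existing epoll loop using
 * libmariadb's non-blocking API. The connection must have been created
 * with mysql_options(con, MYSQL_OPT_NONBLOCK, 0) before connecting. */
#include <string.h>
#include <sys/epoll.h>
#include <mysql.h>
typedef struct {
MYSQL *con;   /* connection opened in non-blocking mode */
int ret;      /* query result code, valid once the op completes */
} DbOp;
/* Begin a query. Returns 1 if it completed without blocking,
 * 0 if it is pending and has been registered with epoll. */
int db_query_start(DbOp *op, int epfd, const char *sql)
{
int status = mysql_real_query_start(&op->ret, op->con, sql, strlen(sql));
if (status == 0)
return 1;
struct epoll_event ev;
memset(&ev, 0, sizeof(ev));
ev.events = ((status & MYSQL_WAIT_READ) ? EPOLLIN : 0) |
((status & MYSQL_WAIT_WRITE) ? EPOLLOUT : 0);
ev.data.ptr = op;
epoll_ctl(epfd, EPOLL_CTL_ADD, mysql_get_socket(op->con), &ev);
return 0;
}
/* Resume a pending query when epoll reports the MySQL socket ready.
 * Returns 1 when the query has finished, 0 if it is still pending.
 * (A full version would also EPOLL_CTL_MOD if the wait set changes.) */
int db_query_cont(DbOp *op, int epfd, int revents)
{
int status = ((revents & EPOLLIN) ? MYSQL_WAIT_READ : 0) |
((revents & EPOLLOUT) ? MYSQL_WAIT_WRITE : 0);
status = mysql_real_query_cont(&op->ret, op->con, status);
if (status == 0) {
epoll_ctl(epfd, EPOLL_CTL_DEL, mysql_get_socket(op->con), NULL);
return 1;
}
return 0;
}
With this shape, the loop keeps servicing curl sockets while a query is in flight instead of stalling inside mysql_query().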
Fixing this isn't simple, but some general advice:
Avoid talking to the database wherever you can. The 'frontier' bookkeeping looks like state that could be kept in-process (see the sketch after this list).
Use an asynchronous library to talk to the database, and integrate it into your event loop, so you can get on with other work while a database query is outstanding (as sketched above).
Make sure your database has the necessary indexes, or consider using something like Redis instead of MySQL, where the access patterns are more explicit, simple accesses are very fast, and you can't really "accidentally" write a badly performing query.
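For the first point, here is a minimal sketch of in-process frontier state, assuming the URL set fits in memory; the chained hash table doubles as the visited set, so the per-link SELECTs that dominate your profile become memory lookups. All names are illustrative, and persisting back to MySQL could be done in batches off the hot path:
/* Sketch: in-process frontier + visited set. A chained hash table
 * answers "seen before?" in O(1) with no database round trip; a
 * singly linked list through the same nodes serves as the FIFO
 * frontier of URLs still to fetch. */
#include <stdlib.h>
#include <string.h>
#define HASH_BUCKETS 65536
typedef struct UrlNode {
char *url;
struct UrlNode *next;       /* hash-chain link */
struct UrlNode *queue_next; /* FIFO link, frontier entries only */
} UrlNode;
static UrlNode *seen[HASH_BUCKETS];            /* visited + queued URLs */
static UrlNode *frontier_head, *frontier_tail; /* FIFO of URLs to fetch */
static unsigned long hash_str(const char *s)   /* djb2, reduced to a bucket */
{
unsigned long h = 5381;
while (*s) h = h * 33 + (unsigned char)*s++;
return h % HASH_BUCKETS;
}
/* Enqueue a URL if it has never been seen. Returns 1 if queued, 0 if known. */
int frontier_push(const char *url)
{
unsigned long b = hash_str(url);
for (UrlNode *n = seen[b]; n; n = n->next)
if (strcmp(n->url, url) == 0)
return 0;                       /* already visited or queued */
UrlNode *n = calloc(1, sizeof(*n));
n->url = strdup(url);
n->next = seen[b];
seen[b] = n;
if (frontier_tail) frontier_tail->queue_next = n;
else               frontier_head = n;
frontier_tail = n;
return 1;
}
/* Return the next URL to crawl, or NULL if the frontier is empty.
 * The node stays in 'seen', so the URL can never be re-queued. */
char *frontier_pop(void)
{
UrlNode *n = frontier_head;
if (!n) return NULL;
frontier_head = n->queue_next;
if (!frontier_head) frontier_tail = NULL;
n->queue_next = NULL;
return n->url;
}
With something like this, html_link_find() calls frontier_push() instead of three MySQL round trips per link, and the main loop calls frontier_pop() instead of mysql_url_frontier_pop().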