使用 mochiweb 解析器不计算图像大小的算法

Algorithm not counting the size of images using mochiweb parser

我正在尝试让这篇文章 https://ppolv.wordpress.com/2008/05/09/fun-with-mochiwebs-html-parser-and-xpath/ 中的算法运行起来。代码可以正常编译和运行(我能得到 html 页面的大小),但是:

我的预期:

大小 html

图片大小

脚本大小

我得到的是:

大小 html

无论如何图像大小都为零

脚本大小无论如何都为零

我花了好几个小时试图找出错误或者我遗漏的东西,但还是不知道哪里出了问题。代码:

-module(test).
-author("Hubert").

%% API
-export([
  page_info/1,
  got_page_info/3,
  content_length/1,
  get_url_context/1,
  spawn_workers/3,
  get_info/2,
  wait_for_responses/2,
  printing/4
]).

%% State carried while the component responses are collected:
%%   page             - the PageSize value handed to got_page_info/3
%%   timer            - reference returned by erlang:send_after/3
%%   errors           - list of {URL, Reason} for failed component requests
%%   img, css, script - running totals added up by collect_info/4
-record(state, {page,timer,errors,img,css,script}).

%% Fetches URL with httpc and hands the response headers' content length and
%% the body to got_page_info/3. A failed request is returned unchanged as
%% {error, Reason}.
page_info(URL) ->
  inets:start(),
  Response = httpc:request(URL),
  case Response of
    {error,_Reason} = Failure ->
      Failure;
    {ok,{_StatusLine,Headers,Body}} ->
      PageSize = content_length(Headers),
      got_page_info(URL,PageSize,Body)
  end.

%% Parses the page body, starts one worker per unique image and script URL,
%% then waits for their responses. Always returns {ok}.
got_page_info(URLpassed, PageSize,Body) ->
  %parse the HTML body into a tree that mochiweb_xpath can query
  Tree = mochiweb_html:parse(Body),

  %unique src attributes of the <img> and <script> elements
  ImgSrcs = rDup(mochiweb_xpath:execute("//img/@src",Tree)),
  ScriptSrcs = rDup(mochiweb_xpath:execute("//script/@src",Tree)),
  %css does not work, do not know why
  %NOTE(review): the disabled query below quotes stylesheet with typographic
  %quotes rather than ASCII ones, which may be why it fails -- confirm.
  %Css = rDup(mochiweb_xpath:execute("//link[@rel=’stylesheet’]/@href",Tree)),

  %one worker process per component URL
  URL = get_url_context(URLpassed),
  spawn_workers(URL,img,[binary_to_list(Src) || Src <- ImgSrcs]),
  spawn_workers(URL,script,[binary_to_list(Src) || Src <- ScriptSrcs]),

  %the atom 'timeout' is sent to this process after 10000 milliseconds
  TRef = erlang:send_after(10000,self(),timeout),

  %number of elements -> so number of responses we should wait for
  Pending = length(ImgSrcs) + length(ScriptSrcs),
  State = #state{page=PageSize, timer=TRef, errors=[], img=0, css=0, script=0},
  wait_for_responses(State,Pending),
  {ok}.

%% Returns the value of the "content-length" header as an integer.
%% Headers is a list of {Name, Value} string pairs; when the header is
%% absent the result is 0.
content_length(Headers) ->
  RawLength = proplists:get_value("content-length",Headers,"0"),
  list_to_integer(RawLength).

%% Removes duplicate elements from a list by round-tripping it through a set.
%% The order of the returned elements is whatever sets:to_list/1 yields.
rDup(Items) ->
  Unique = sets:from_list(Items),
  sets:to_list(Unique).

%spawn workers for every URl, who send back info about components -> getinfo
% One process is spawned per entry in URLs. Each process calls get_info/2 for
% its URL and sends a {component, Type, Url, Result} message.
% NOTE: self() below is evaluated inside the spawned fun, so the message is
% addressed to the spawned process itself, not to the process that called
% spawn_workers/3.
spawn_workers(URLctx,Type,URLs) ->
  lists:foreach(fun (Url) -> spawn( fun () ->
                                    self() ! {component, Type,Url,get_info(URLctx,Url)}
                                    end)
              end, URLs).

%% Splits a page URL into {Root, Context}.
%%   Root    - scheme, host and (when given explicitly) port,
%%             e.g. "https://example.com:8443"
%%   Context - the path up to and including its last "/", e.g. "/blog/";
%%             an empty path is treated as "/"
%% Accepts any scheme (not only http) and crashes with a badmatch when URL
%% cannot be parsed or has no host.
get_url_context(URL) ->
  #{scheme := Scheme, host := Host} = Parsed = uri_string:parse(URL),
  %keep an explicitly given port so components are requested from the same server
  Port = case maps:find(port,Parsed) of
           {ok,PortNumber} when is_integer(PortNumber) ->
             ":" ++ integer_to_list(PortNumber);
           _ ->
             ""
         end,
  Path = case maps:get(path,Parsed,"") of
           "" -> "/";
           NonEmpty -> NonEmpty
         end,
  %cut the path after its last "/" (drops the document name)
  Ctx = case string:split(Path,"/",trailing) of
          [Directory,_LastSegment] -> Directory ++ "/";
          [_NoSlash] -> ""
        end,
  {Scheme ++ "://" ++ Host ++ Port,Ctx}. %% root url together with its context

%% Issues an HTTP HEAD request for one component and returns
%% {ok, ContentLength} taken from the response headers, or the
%% {error, Reason} tuple returned by httpc.
get_info(URlctx,Url) ->
  Request = {full_url(URlctx,Url),[]},
  case httpc:request(head,Request,[],[]) of
    {error,_Reason} = Failure ->
      Failure;
    {ok, {_StatusLine,Headers,_Body}} ->
      {ok,content_length(Headers)}
  end.


%FULL URL FUNCTIONS
%% Builds the URL to request for a component referenced by a page.
%% The first argument is the {Root, Context} pair describing the page
%% (Root like "http://host", Context like "/dir/"); the second is the
%% component reference as found in the page.

%% protocol-relative url ej: //cdn.other.com/img.png -> reuse the scheme of Root
full_url({Root,_Context},ComponentUrl="//"++_) ->
  Scheme = lists:takewhile(fun (Char) -> Char =/= $: end,Root),
  Scheme ++ ":" ++ ComponentUrl;
%% abs url inside the same server ej: /img/image.png
full_url({Root,_Context},ComponentUrl=[$/|_]) ->
  Root ++ ComponentUrl;
%% full url ej: http://other.com/img.png
full_url({_Root,_Context},ComponentUrl="http://"++_) ->
  ComponentUrl;
%% full url over https ej: https://other.com/img.png
full_url({_Root,_Context},ComponentUrl="https://"++_) ->
  ComponentUrl;
% everything else is considered a relative path; "../" segments are not
% resolved, they are passed through as they are
full_url({Root,Context},ComponentUrl) ->
  Root ++ with_trailing_slash(Context) ++ ComponentUrl.

%% Appends "/" to Path unless it already ends with one, so that joining it
%% with a relative reference yields exactly one separator.
with_trailing_slash(Path) ->
  case lists:suffix("/",Path) of
    true -> Path;
    false -> Path ++ "/"
  end.

%folds one worker result into State: a successful size is added to the total
%of its component type (css, img or script); a failure is prepended to the
%errors list as {URL, Reason}
collect_info(#state{} = State,css,_URL,{ok,Size}) ->
  State#state{css = State#state.css + Size};
collect_info(#state{} = State,img,_URL,{ok,Size}) ->
  State#state{img = State#state.img + Size};
collect_info(#state{} = State,script,_URL,{ok,Size}) ->
  State#state{script = State#state.script + Size};
collect_info(#state{} = State,_Type,URL,{error,Reason}) ->
  State#state{errors = [{URL,Reason} | State#state.errors]}.

%receives worker messages until Counter responses have been collected or the
%'timeout' message arrives, then hands the accumulated State to finalize/2
%together with the number of responses still outstanding
wait_for_responses(State,Counter) ->
    case Counter of
      0 ->
          finalize(State,0);
      _ ->
          receive
            timeout ->
                finalize(State,Counter);
            {component,Type,URL,Info} ->
                Updated = collect_info(State,Type,URL,Info),
                wait_for_responses(Updated,Counter - 1)
          end
    end.

%prepares variables for printing
%Takes the sizes and the timer reference out of the #state record, cancels
%the timer and prints the sizes via printing/4, whose result is returned.
%The second argument (the number of responses still outstanding) and the
%collected errors are currently not reported.
finalize(#state{page=PageSize,img=ImgSize,css=CssSize,script=ScriptSize,timer=TRef},
         _Left) ->
  erlang:cancel_timer(TRef),
  printing(PageSize,ImgSize,CssSize,ScriptSize).

%% Prints the html, images and script sizes (given in bytes) in kilobytes
%% with two decimals and returns {ok}. The stylesheet size is accepted but
%% not printed.
printing(PageSize,ImgSize,_CssSize,ScriptSize)->
  Report = [{"html size: ~.2fkb~n",PageSize},
            {"images size: ~.2fkb~n",ImgSize},
            {"script size: ~.2fkb~n",ScriptSize}],
 % stylesheet line is disabled: "stylesheet size: ~.2fkb~n"
  lists:foreach(fun ({Format,Bytes}) -> io:format(Format,[Bytes/1024]) end,Report),
  {ok}.

问题出在函数中:

% Spawns one process per entry in URLs; each process calls get_info/2 and
% sends a {component, Type, Url, Result} message.
% NOTE: self() is evaluated inside the spawned fun, so the destination is the
% spawned process itself rather than the caller of spawn_workers/3.
spawn_workers(URLctx,Type,URLs) ->
  lists:foreach(fun (Url) -> spawn( fun () ->
                                    self() ! {component, Type,Url,get_info(URLctx,Url)}
                                    end)
              end, URLs).

self() 是在新派生的进程中求值的,因此该进程把响应发给了它自己。应当在派生进程之前先对 self() 求值,并把结果绑定到一个变量:

% Spawns one process per entry in URLs. The caller's pid is captured before
% spawning, so every worker sends its {component, Type, Url, Result} message
% to the process that called spawn_workers/3.
spawn_workers(URLctx,Type,URLs) ->
  Pid = self(),
  Report = fun (Url) ->
             Pid ! {component, Type,Url,get_info(URLctx,Url)}
           end,
  lists:foreach(fun (Url) -> spawn(fun () -> Report(Url) end) end, URLs).

我不直接指出错误在哪里,而是向您展示如何使用 Erlang 的跟踪调试工具 dbg 自己把它找出来。用下面这些命令启动它:

dbg:tracer().  % start the tracer process
dbg:p(all, c).  % enable call tracing for all processes; patterns are given below
ShowReturnedResults = [{'_', [], [{return_trace}]}].  % match specification that also traces return values (see the dbg docs)
dbg:tpl(test, get_info, '_', ShowReturnedResults).
test:page_info("http://www.lambdadays.org").

这将向您展示,为每张图片调用 get_info/2 并返回一些结果。 所以问题一定出在收集结果上,让我们检查一下 wait_for_responses/2:

dbg:stop_clear().  % clears all traces
dbg:tracer().
dbg:p(all, c).
ShowReturnedResults = [{'_', [], [{return_trace}]}].  % match specification that also traces return values (see the dbg docs)
dbg:tpl(test, wait_for_responses, '_', ShowReturnedResults).
test:page_info("http://www.lambdadays.org").

糟糕。它只被调用一次。这意味着,它达到了超时。让我们看一下此调用期间发送的消息。因为 io:format 发送了很多消息,所以让我们在另一个进程中生成函数。

Pid = spawn(fun() -> test:page_info("http://www.lambdadays.org") end),
dbg:p(Pid, [sos, m]).  % print all messages sent and received by this process and by the processes it spawns

您应该会收到很多消息,但我们只对返回的元组感兴趣:{component,img...},因此您可以找到类似这样的内容:

(<0.200.0>) <0.200.0> ! {component,img,
                               "/static/upload/media/1407924850920422agh.png",
                               {ok,189930}}
(<0.200.0>) << {component,img,"/static/upload/media/1407924850920422agh.png",
                      {ok,189930}}
(<0.199.0>) <0.199.0> ! {component,img,
                               "/static/upload/media/1407659467205755logo_glowna.png",
                               {ok,6424}}
(<0.199.0>) << {component,img,
                      "/static/upload/media/1407659467205755logo_glowna.png",
                      {ok,6424}}

您可以将其解读为:

  • 进程 0.200.0 将结果发送给进程 0.200.0
  • 进程 0.200.0 将此结果接收到其邮箱
  • 进程 0.199.0 向自己发送消息并接收它。

但是他们为什么要发给自己呢?

%spawn workers for every URl, who send back info about components -> getinfo
% NOTE: self() is evaluated inside the spawned fun, so each worker addresses
% the {component, Type, Url, Result} message to itself rather than to the
% process that called spawn_workers/3.
spawn_workers(URLctx,Type,URLs) ->
  lists:foreach(fun (Url) -> spawn( fun () ->
                                self() ! {component, Type,Url,get_info(URLctx,Url)}
                                end)
              end, URLs).

内部的 fun 是在新派生进程的上下文中求值的,因此该进程把消息发给了它自己,而不是父进程。您必须在父进程的上下文中对 self() 求值,把结果绑定到一个变量,然后在 fun 中使用这个变量。

%spawn one worker per URL; each worker sends its get_info/2 result back to the
%process that called spawn_workers/3
spawn_workers(URLctx,Type,URLs) ->
  %evaluated here, in the calling process, so it names the parent
  Parent = self(),
  StartWorker =
    fun (Url) ->
      spawn(fun () ->
              Result = get_info(URLctx,Url),
              Parent ! {component, Type,Url,Result}
            end)
    end,
  lists:foreach(StartWorker, URLs).

dbg documentation