Monday, February 18, 2008

Makings of a simple web scraper in Erlang

This code parses a web page and tokenizes the content. It uses Joe Armstrong's www_tools library, and I have been trying to get the rfc4627 code to decode Unicode documents; that part is still a work in progress. Ultimately, I would like to use this code to crawl FOAF documents.
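As a small aside, since the rfc4627 route is still in flux, here is a minimal sketch of the decode step, assuming the fetched body is UTF-8 and using OTP's xmerl_ucs instead; decode_body/1 is a hypothetical helper and not part of the listing below.

%% Hypothetical helper (not in the driver below): convert a fetched UTF-8
%% binary into a list of Unicode code points using OTP's xmerl_ucs.
decode_body(Bin) when is_binary(Bin) ->
    xmerl_ucs:from_utf8(binary_to_list(Bin)).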

Simple Driver Code (uses url.erl and disk_cache).

%%
%% Simple statistical analysis of social networking sites
%% Author: Berlin Brown
%% Date: 2/12/2008
%%

-module(socialstats).

-export([start_social/0]).

-import(url, [test/0, raw_get_url/2, start_cache/1, stop_cache/0]).
-import(rfc4627, [unicode_decode/1]).
-import(html_analyze, [disk_cache_analyze/1]).

-define(SocialURL, "http://botnode.com/").

start_social() ->
    io:format("*** Running social statistics~n"),
    %% First, set up the URL disk cache
    url:start_cache("db_cache/socialstats.dc"),
    %% Fetch the page and store it in the disk cache
    case url:raw_get_url(?SocialURL, 60000) of
        {ok, Data} ->
            io:format("Data found from URL, storing=~s~n", [?SocialURL]),
            disk_cache:store(?SocialURL, Data),
            %% Val = list_to_binary(xmerl_ucs:from_utf8([Data])),
            %% Val = rfc4627:unicode_decode(Data),
            {ok, Data};
        {error, What} ->
            io:format("ERR:~p ~n", [What]),
            {error, What}
    end,
    %% Tokenize the cached document
    case disk_cache:fetch(?SocialURL) of
        {ok, Bin} ->
            io:format("Data found from disk cache, fetching=~s~n", [?SocialURL]),
            Toks = html_tokenise:disk_cache2toks(?SocialURL),
            io:format("Tokens from disk cache=~p~n", [Toks]),
            {ok, Bin};
        {error, Err} ->
            io:format("ERR:~p ~n", [Err]),
            {error, Err}
    end,
    %% Stop the disk cache
    url:stop_cache(),
    io:format("*** Done [!]~n").

%% End of File
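
For reference, a rough way to compile and run the driver from the Erlang shell, assuming the www_tools modules (url, disk_cache, html_tokenise, and so on) are already compiled and on the code path; the paths here are only placeholders:

$ erlc socialstats.erl
$ erl -pa . -pa /path/to/www_tools/ebin
1> socialstats:start_social().
*** Running social statistics
...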


This, in turn, brings in Joe Armstrong's TCP/IP code for retrieving the document.


raw_get_url(URL, Timeout) ->
    case url_parse:parse(URL) of
        {error, _Why} ->
            {error, {badURL, URL}};
        {http, HostName, Port, File} ->
            get_http(HostName, Port, File, ["Host: ", HostName], Timeout);
        {file, Location} ->
            get_file(Location)
    end.

raw_get_url(URL, Timeout, {IP, Port}) ->
    get_http(IP, Port, URL, [], Timeout).

get_file(Location) ->
    file:read_file(Location).

get_http(IP, Port, URL, Opts, Timeout) ->
    %% io:format("ip = ~p, port = ~p, url = ~p~n", [IP, Port, URL]),
    Cmd = ["GET ", URL, " HTTP/1.1\r\n", Opts, "\r\n\r\n"],
    io:format("Cmd=~p\n", [Cmd]),
    io:format("url_server: fetching ~p ~p ~p~n", [IP, Port, URL]),
    case catch gen_tcp:connect(IP, Port,
                               [binary, {packet, raw}, {nodelay, true}, {active, true}]) of
        {'EXIT', Why} ->
            %% io:format("Socket exit:~p~n", [Why]),
            {error, {socket_exit, Why}};
        {error, Why} ->
            %% io:format("Socket error:~p~n", [Why]),
            {error, {socket_error, Why}};
        {ok, Socket} ->
            %% io:format("Socket = ~p~n", [Socket]),
            gen_tcp:send(Socket, Cmd),
            receive_data(Socket, Timeout, list_to_binary([]))
    end.

receive_data(Socket, Timeout, Bin) ->
    receive
        {tcp, Socket, B} ->
            %% io:format(".", []),
            %% Accumulate each TCP packet into the binary buffer
            receive_data(Socket, Timeout, list_to_binary([Bin, B]));
        {tcp_closed, Socket} ->
            %% Server closed the connection: strip the HTTP header and
            %% return the body as a binary
            Data0 = binary_to_list(Bin),
            %% io:fwrite("Socket closed: ~p~n", [Data0]),
            {Data1, _Info} = get_header(Data0, []),
            Bin1 = list_to_binary(Data1),
            {ok, Bin1};
        Other ->
            %% io:fwrite("Other: ~p~n", [Other]),
            {error, {socket, Other}}
    after Timeout ->
        {error, timeout}
    end.
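
Note that get_header/2 is referenced above but not shown in this snippet. A minimal sketch, assuming it simply splits the raw response at the blank line separating the HTTP headers from the body (the real www_tools version may differ), might look like this:

%% Hypothetical sketch of get_header/2: walk the response, accumulating
%% header characters until the blank line, then return {Body, Header}.
get_header("\r\n\r\n" ++ Body, Hdr) ->
    {Body, lists:reverse(Hdr)};
get_header([C | Rest], Hdr) ->
    get_header(Rest, [C | Hdr]);
get_header([], Hdr) ->
    %% No header/body separator found: return everything as the body
    {lists:reverse(Hdr), []}.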
