Monday, February 18, 2008

Makings of a simple web scraper in Erlang

This code parses a web page and tokenizes the content. It uses Joe Armstrong's www_tools library; I was also trying to get the rfc4627 code to parse Unicode documents, but that particular code is still a work in progress. Ultimately, I would like to be able to use this code to crawl FOAF documents.

Simple Driver Code (uses url.erl and disk_cache).

%%
%% Simple Statistic Analysis of social networking sites
%% Author: Berlin Brown
%% Date: 2/12/2008
%%

-module(socialstats).

-export([start_social/0]).

-import(url, [test/0, raw_get_url/2, start_cache/1, stop_cache/0]).
-import(rfc4627, [unicode_decode/1]).
-import(html_analyze, [disk_cache_analyze/1]).

-define(SocialURL, "http://botnode.com/").

%% Download ?SocialURL, store the page in the URL disk cache, tokenise the
%% cached copy and print the resulting tokens.
%%
%% The disk cache opened with url:start_cache/1 is always released through
%% url:stop_cache/0, even when downloading, storing or tokenising crashes;
%% in that case the exception is re-raised after the cache has been stopped.
%% Returns the result of the final io:format/1 call.
start_social() ->
    io:format("*** Running social statistics~n"),
    %% First, set up the URL disk cache.
    url:start_cache("db_cache/socialstats.dc"),
    try
        fetch_and_store(?SocialURL),
        analyze_cached(?SocialURL)
    after
        %% Stop the disk cache no matter how the body above ended.
        url:stop_cache()
    end,
    io:format("*** Done [!]~n").

%% Download URL (60 second timeout) and store the data in the disk cache.
%% A failed download is only reported; the caller still goes on to look in
%% the disk cache afterwards.
fetch_and_store(URL) ->
    case url:raw_get_url(URL, 60000) of
        {ok, Data} ->
            io:format("Data found from URL, storing=~s~n", [URL]),
            disk_cache:store(URL, Data),
            {ok, Data};
        {error, What} ->
            io:format("ERR:~p ~n", [What]),
            {error, What}
    end.

%% Look URL up in the disk cache and, when present, tokenise the cached
%% document and print the tokens.
analyze_cached(URL) ->
    case disk_cache:fetch(URL) of
        {ok, Bin} ->
            io:format("Data found from disk cache, fetching=~s~n", [URL]),
            Toks = html_tokenise:disk_cache2toks(URL),
            io:format("Data found from disk cache, fetching=~p~n", [Toks]),
            {ok, Bin};
        {error, Err} ->
            io:format("ERR:~p ~n", [Err]),
            {error, Err}
    end.

%% End of File


This, in turn, brings up Joe Armstrong's TCP/IP code for retrieving the document.


%% Fetch the document that URL points at.
%%
%% The URL is parsed with url_parse:parse/1 and dispatched on the result:
%%   {http, Host, Port, Path} -> HTTP GET via get_http/5 with a Host header
%%   {file, Path}             -> read from the local file system
%%   {error, _}               -> {error, {badURL, URL}}
%% Timeout is passed on to get_http/5 and is not used for file URLs.
raw_get_url(URL, Timeout) ->
    Parsed = url_parse:parse(URL),
    case Parsed of
        {http, Host, Port, Path} ->
            HostHeader = ["Host: ", Host],
            get_http(Host, Port, Path, HostHeader, Timeout);
        {file, Path} ->
            get_file(Path);
        {error, _Why} ->
            %% The parser's reason is dropped; callers only see the bad URL.
            {error, {badURL, URL}}
    end.

%% Fetch URL by connecting to the explicitly given IP and Port instead of
%% parsing the host out of the URL. URL is sent verbatim in the GET request
%% line and no extra request headers are supplied.
%% NOTE(review): presumably meant for fetching through a proxy - confirm.
raw_get_url(URL, Timeout, {IP, Port}) ->
    NoHeaders = [],
    get_http(IP, Port, URL, NoHeaders, Timeout).

%% Read the whole file at Path with file:read_file/1 and return its result
%% unchanged: {ok, Binary} on success or {error, Reason} on failure.
get_file(Path) ->
    file:read_file(Path).

%% Open a TCP connection to IP:Port, send an HTTP/1.1 GET request for URL
%% and collect the reply with receive_data/3.
%%
%% Opts is an iolist of extra request header lines (for example
%% ["Host: ", HostName]); it is placed after the fixed headers.
%% Timeout (milliseconds) bounds the connect attempt and is also handed to
%% receive_data/3.
%%
%% Returns whatever receive_data/3 returns, or
%%   {error, {socket_error, Reason}} when connecting or sending fails,
%%   {error, {socket_exit, Why}}     when gen_tcp:connect/4 raises.
%% The socket is always closed before this function returns.
get_http(IP, Port, URL, Opts, Timeout) ->
    %% "Connection: close" asks the server to close the socket once the
    %% response is complete. receive_data/3 only finishes on tcp_closed, so
    %% without it a keep-alive server would stall the fetch until a timeout.
    %% The header goes before Opts so an empty Opts cannot end the header
    %% section ahead of it.
    Cmd = ["GET ", URL, " HTTP/1.1\r\n", "Connection: close\r\n", Opts, "\r\n\r\n"],
    io:format("Cmd=~p\n", [Cmd]),
    io:format("url_server: fetching ~p ~p ~p~n", [IP, Port, URL]),
    ConnectOpts = [binary, {packet, raw}, {nodelay, true}, {active, true}],
    %% Old-style catch is kept on purpose so the {socket_exit, Why} term
    %% keeps the shape existing callers may match on.
    case catch gen_tcp:connect(IP, Port, ConnectOpts, Timeout) of
        {'EXIT', Why} ->
            {error, {socket_exit, Why}};
        {error, Why} ->
            {error, {socket_error, Why}};
        {ok, Socket} ->
            try
                case gen_tcp:send(Socket, Cmd) of
                    ok ->
                        receive_data(Socket, Timeout, <<>>);
                    {error, Reason} ->
                        {error, {socket_error, Reason}}
                end
            after
                %% Release the socket on every path: success, timeout,
                %% unexpected message or crash.
                gen_tcp:close(Socket)
            end
    end.

%% Collect everything an active-mode Socket delivers until the peer closes
%% the connection.
%%
%% Bin is the data accumulated so far (a binary). Timeout (milliseconds) is
%% applied to each wait for the next message, so it restarts after every
%% received chunk rather than bounding the whole transfer.
%%
%% Returns
%%   {ok, Binary}             - after tcp_closed; the accumulated data is
%%                              passed through get_header/2 and the first
%%                              element of its result is returned
%%   {error, {socket, Other}} - any other message arrived in the mailbox
%%   {error, timeout}         - nothing arrived within Timeout
%% NOTE(review): get_header/2 is defined outside this excerpt; presumably it
%% strips the HTTP response header from the data - confirm.
receive_data(Socket, Timeout, Bin) ->
    receive
        {tcp, Socket, Chunk} ->
            %% Append with binary syntax; the concat_binary/1 BIF used
            %% previously no longer exists in current OTP releases.
            receive_data(Socket, Timeout, <<Bin/binary, Chunk/binary>>);
        {tcp_closed, Socket} ->
            {Body, _Info} = get_header(binary_to_list(Bin), []),
            {ok, list_to_binary(Body)};
        Other ->
            %% Kept for compatibility: any other message, including
            %% {tcp_error, Socket, Reason}, aborts the transfer.
            {error, {socket, Other}}
    after Timeout ->
        {error, timeout}
    end.

No comments: