* Web page scraping packages
@ 2006-08-01 0:06 Joel Reymont
2006-08-01 0:42 ` [Caml-list] " Karl Zilles
2006-08-01 9:10 ` Richard Jones
0 siblings, 2 replies; 3+ messages in thread
From: Joel Reymont @ 2006-08-01 0:06 UTC (permalink / raw)
To: caml-list
Folks,
Are there any screen-scraping packages for OCaml?
I'm looking for something that would let me analyze the contents of a
web page and extract, for example, all the image tags.
I'm using Ruby for this at work and something like hpricot [1] is
very neat but also somewhat slow.
Thanks, Joel
[1] http://code.whytheluckystiff.net/hpricot/
--
http://wagerlabs.com/
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [Caml-list] Web page scraping packages
2006-08-01 0:06 Web page scraping packages Joel Reymont
@ 2006-08-01 0:42 ` Karl Zilles
2006-08-01 9:10 ` Richard Jones
1 sibling, 0 replies; 3+ messages in thread
From: Karl Zilles @ 2006-08-01 0:42 UTC (permalink / raw)
To: Joel Reymont; +Cc: caml-list
[-- Attachment #1: Type: text/plain, Size: 1297 bytes --]
Joel Reymont wrote:
> Are there any screen-scraping packages for OCaml?
>
> I'm looking for something that would let me analyze the contents of a
> web page and extract, for example, all the image tags.
I don't think of this as screen scraping. Spidering might be a better word.
I've done a good bit of this in OCaml. I use the curl package for
downloading web pages and the netstring package for parsing them.
I'm going to attach a couple of files that I use for this sort of stuff.
The file htmltreeutils.ml has a bunch of functions for working with
the results of a nethtml parse tree.
So your program would look something like this.. and this hasn't been
tested:
open Htmltreeutils
let result = Buffer.create 2000 in
let connection = Curl.init () in
Curl.set_httpget connection true;
Curl.set_url connection "http://www.yahoo.com/randompage.html";
Curl.set_writefunction connection (fun s -> Buffer.add_string
result s);
Curl.set_headerfunction connection (fun s -> ());
Curl.perform connection;
Curl.cleanup connection;
let dom = get_parsed_html_from_string result in
let img_tags = list_tags "img" dom in
.... do something with img tags here like pull out their src
attributes
Here are the two helper files:
[-- Attachment #2: htmltreeutils.ml --]
[-- Type: text/plain, Size: 9632 bytes --]
(** Functions to generate and search parsed html pages.
Uses the Nethtml module to do the heAvy lifting. *)
(** Author: Karl Zilles, released into public domain *)
open Nethtml
open Pcre
open Printf
open Utility
(** {1:Fix questionable "form" decision in dtd} *)
let fix_dtd dtd =
List.map (function
| (name, (elclass, _)) when name = "form" ->
(name, (elclass, `Sub_exclusions( ["form"], `Flow )))
| line -> line
) dtd
let my_dtd = fix_dtd relaxed_html40_dtd
(** {1:create Create an html tree} *)
let get_parsed_html_from_channel inchannel =
let parsed = List.hd (parse ~dtd:my_dtd inchannel) in
inchannel#close_in ();
parsed
(** [ get_parsed_html file ] returns an html tree of the contents of file.
See the Nethtml documentation for the format *)
let get_parsed_html file =
let inchannel = new Netchannels.input_channel (open_in file) in
get_parsed_html_from_channel inchannel
(** [ get_parsed_html file ] returns an html tree of the contents of file.
See the Nethtml documentation for the format *)
let get_parsed_html_from_string str =
let inchannel = new Netchannels.input_string str in
get_parsed_html_from_channel inchannel
(** {1:inspect Inspecting a tag} *)
(** [ match_tag tag doc] returns true if the current tag has type "tag".
Example: [match_tag "td" doc] will return true if doc is a "td" tag. *)
let match_tag thistag = function
| Element (tag,_,_) when tag = thistag -> true
| _ -> false
let tag_type = function
| Element (tag,_,_) -> tag
| _ -> raise Not_found
(** [ attribute doc attr ] returns the value of the attribute attr if it
exists in the current tag, or throws a [ Not_found ] exception if it doesn't *)
let attribute doc name = match doc with
| Element (_,attributes,_) ->
List.assoc name attributes
| _ -> raise Not_found
let as_text, as_text_list =
let rec as_text' = function
| Element (_,_,subdocs) -> as_text_list' subdocs
| Data text -> replace ~pat:"(\\r|\\n)" text
and as_text_list' l =
replace ~pat:"\\s+" ~templ:" "
(List.fold_left (fun current doc -> current ^
(as_text' doc)) "" l) in
(compose strip as_text', compose strip as_text_list')
(** [ as_text doc] returns a string of the text contents of this tag and
all of it's subtags*)
let as_text = as_text
(** [ as_text_list doc] returns a string of the text contents of all the tags
in this list and all of their subtags*)
let as_text_list = as_text_list
let rec as_text_formatted = function
| Element (_,_,subdocs) -> as_text_list_formatted subdocs
| Data text -> text
and as_text_list_formatted l =
(List.fold_left (fun current doc -> current ^
(as_text_formatted doc)) "" l)
(** [ as_html doc] returns a string of the html contents of all the tags
in this list and all of their subtags*)
let as_html_list documentlist =
let result = Buffer.create 2000 in
let outbuffer = new Netchannels.output_buffer result in
write ~dtd:my_dtd outbuffer documentlist;
Buffer.contents result
let as_html document = as_html_list [document]
(** {1:search Search or process an html tree} *)
(** [ iter_document f doc ] runs the function f on every tag in the document *)
let rec iter_document f doc =
f doc;
match doc with
| Element (_,_,subdocs) ->
List.iter (iter_document f) subdocs
| _ -> ()
let fold_left f = fold_left_from_iterator iter_document f
(** [ iter_document f doc ] runs the function f on every tag in the document.
In addition to passing the current element to f, it also passes a list
of all the parent tags of that element. *)
let iter_document_with_parents f doc =
let rec iter_document_with_parents' f parents doc =
f doc parents;
match doc with
| Element (_,_,subdocs) ->
List.iter (iter_document_with_parents' f (doc::parents))
subdocs
| _ -> ()
in
iter_document_with_parents' f [] doc
(** [ list_tags tagname doc ] returns a list of all tags of a certain type
in the html tree *)
let list_tags tagname doc =
let tags = ref [] in
iter_document (fun doc ->
if match_tag tagname doc then
tags := doc :: !tags) doc;
List.rev !tags
(** [ list_tags_with_parents tagname doc ] returns a list of all tags of a
certain type in the html tree, and also includes a list of their parents
with each one. *)
let list_tags_with_parents tagname doc =
let tags = ref [] in
iter_document_with_parents (fun doc parents ->
if match_tag tagname doc then
tags := (doc,parents) :: !tags) doc;
List.rev !tags
(** [ find filter doc ] returns the first element that filter returns
true on, searching the document in a depth first search. Raises a [ Not_found ]
exception if nothing is matched. *)
let find filter document =
let rec find' filter document =
if filter document then
Some document
else
match document with
| Element (_,_,subdocs) ->
let rec loop = (function
| h::t ->
(match find' filter h with
| Some x as result -> result
| None -> loop t)
| [] -> None) in
loop subdocs
| _ -> None in
match find' filter document with
| Some x -> x
| None -> raise Not_found
(** [ find filter doc ] returns the first element that filter returns
true on, searching the document in a depth first search. Note that in
this function we also pass along a list of parents to the filter, and
return the list of parents with the matching object. Raises a [ Not_found ]
exception if nothing is matched. *)
let find_with_parent filter document =
let rec find' filter parents document =
if filter parents document then
Some (document,parents)
else
match document with
| Element (_,_,subdocs) ->
let add_parents = document :: parents in
let rec loop = (function
| h::t ->
(match find' filter add_parents h with
| Some x as result -> result
| None -> loop t)
| [] -> None) in
loop subdocs
| _ -> None in
match find' filter [] document with
| Some x -> x
| None -> raise Not_found
(** [ parse_tags_at_same_level tag doc] looks down from the passed document
and finds the first matching tag, then returns a list of
matching tags with the same parent as the first match. *)
let parse_tags_at_same_level tag document =
let is_tag = match_tag tag in
let first_tr, parents = find_with_parent (function _ -> is_tag)
document in
let siblings = match List.hd parents with
Element (_,_,x) -> x | _ -> assert false in
List.filter is_tag siblings
(** [parse_trs doc] calls parse_tags_at_same_level matching "tr" tags *)
let parse_trs = parse_tags_at_same_level "tr"
(** [parse_tds doc] calls parse_tags_at_same_level matching "td" tags *)
let parse_tds = parse_tags_at_same_level "td"
let get_tag_with_name tag name document =
find (
fun el ->
try match_tag tag el && (attribute el "name")=name
with Not_found -> false
) document
(* form handling *)
let decode = Netencoding.Html.decode_to_latin1
let get_form = get_tag_with_name "form"
let get_select = get_tag_with_name "select"
let select_selection selection =
try
(* look for a "selected" value *)
find (fun el ->
try ignore (attribute el "selected"); true
with Not_found-> false) selection
with Not_found ->
(* otherwise, take first option element *)
find (match_tag "option") selection
let select_value selection =
decode (attribute (select_selection selection) "value")
let select_label selection =
decode (as_text (select_selection selection))
let get_select_value form name =
select_value (get_select name form)
let get_select_label form name =
select_label (get_select name form)
let input_value input =
decode (attribute input "value")
let get_input_value form name =
input_value (get_tag_with_name "input" name form)
let textarea_value textarea =
decode (as_text_formatted textarea)
let get_textarea_value form name =
textarea_value (get_tag_with_name "textarea" name form)
let element_name default el = try attribute el "name"
with Not_found -> default
let list_forms document =
list_tags "form" document
let form_names document =
List.map (element_name "[No name]") (list_forms document)
let element_to_accessor = [
"select", select_value;
"input", input_value;
"textarea", textarea_value;
]
let get_field_list form =
List.rev (fold_left (fun current el ->
try
let accessor = List.assoc (tag_type el) element_to_accessor in
(* printf "%s: %s \n" (tag_type el) (element_name "noname" el); *)
( element_name "noname" el, accessor el ) :: current
with Not_found -> current
) [] form)
let print_field_list a =
printf "[\n";
List.iter (function name,value ->
printf " \"%s\",\"%s\";\n" (String.escaped name) (String.escaped value)
) a;
printf "]\n"
[-- Attachment #3: utility.ml --]
[-- Type: text/plain, Size: 5522 bytes --]
(** Utility functions for general use. Most of these deal with the
default data structures, but some require the pcre library, and some
require the Unix module *)
(** Author: Karl Zilles, released into public domain *)
open Pcre
open Printf
(** {1:generic Generic functions:} *)
(** [ compose f g ] returns a new function that is like
running g on the inputs and then f on the results of g *)
let compose f g = fun x -> f (g x)
(** [ fold_left_from_iterator iter ] returns a fold_left function on
the same datastructure that your iter function works on. You may have
to use it as so:
let fold_left f = fold_left_from_iterator iter_document f
to avoid "cannot be generalized" errors
*)
let fold_left_from_iterator iter =
(fun f init data ->
let cur = ref init in
iter (fun el -> cur := f !cur el) data;
!cur)
(** {1:hash Hash table functions:} *)
(** Returns the unique list of keys in a Hashtable *)
let hash_keys h =
Hashtbl.fold (fun key _ l ->
if l = [] || (List.hd l) <> key then key :: l else l
) h []
(** Returns the unique list of values in a Hashtable *)
let hash_values h =
Hashtbl.fold (fun _ value l ->
if not (List.mem value l) then value :: l else l) h []
(** Get a value or fail with error *)
let get_value_or_fail hash key error =
try Hashtbl.find hash key
with Not_found -> raise (Failure error)
(** {1:list List functions:} *)
(** [ list_iteri f l ] runs the function f on every element of l, passing
the 0-based index of the element, and the element itself *)
let list_iteri f l =
let rec list_iteri' i = function
| [] -> ()
| h::t -> f i h; list_iteri' (i+1) t
in
list_iteri' 0 l
(** [ list_mapi f l ] returns a list of the results of runing the function f on every element of l, passing the 0-based index of the element, and the element itself *)
let list_mapi f l =
let rec list_mapi' i = function
| [] -> []
| h::t -> (f i h)::(list_mapi' (i+1) t)
in
list_mapi' 0 l
(** [ list_skip n l ] returns the list l with the first n elements removed, or
the empty list if it runs out of elements to skip *)
let rec list_skip n l =
if n = 0 || l = [] then l else list_skip (n-1) (List.tl l)
(** [ list_first n l ] returns the first n element of list l, or as many as it
can find *)
let rec list_first n l =
if n = 0 then [] else
match l with
| [] -> []
| h::t -> h::list_first (n-1) t
(** [ assoc_merge_with_replace first second ] returns a merge of two association
* lists with any duplicate keys using the values from the second list. *)
let assoc_merge_with_replace first second =
List.fold_left (fun cur entry ->
if not (List.mem_assoc (fst entry) second) then entry::cur else cur)
second (List.rev first)
(** {1:string String functions:} *)
let list_of_string s =
let rec chars s index so_far =
if index < 0 then so_far
else chars s (index-1) (s.[index] :: so_far)
in
chars s ((String.length s)-1) []
(** [ string_ends_with contents end ] returns true if the last characters
* of the string contents are the string end *)
let string_ends_with s e =
let len_s = String.length s in
let len_e = String.length e in
if len_s >= len_e then
(String.sub s (len_s - len_e) len_e) = e
else false
(** {1:system System commands:} *)
(** [ command_to_string_list command ] runs command as an external process
and then copies the stdout of the results into a list of strings. Stderr
goes to the ocaml stderr *)
let command_to_string_list command =
let input = Unix.open_process_in command in
let results = ref [] in
(try
while true do
results := (input_line input) :: !results
done
with
End_of_file -> ());
List.rev !results
(** [ quiet_mkdir dir permission ] runs the standard Unix.mkdir, but
does not throw an exception if the directory already exists *)
let quiet_mkdir dir permission =
try Unix.mkdir dir permission
with Unix.Unix_error (Unix.EEXIST, _, _) -> ()
(** {1:pcre Pcre tools:} *)
(** Returns the string with all leading and trailing spaces removed
including \160 which is some weird space like character that excel
seems to like *)
let strip s = replace ~pat:"(^(\\s|\160| )+|(\\s|\160| )+$)" s
(** A pregenerated option list for doing caseless matches in Pcre *)
let caseless = cflags [`CASELESS]
(** {1:config Configuration file tools:} *)
let parse_config_file config_file =
let results = Hashtbl.create 10 in
foreach_file [config_file] (fun _ input ->
foreach_line ~ic:input (fun line ->
try
let m = extract ~pat:"(.*)=(.*)" line in
Hashtbl.replace results (strip m.(1)) (strip m.(2));
with Not_found ->
if strip line <> "" then
eprintf "Unable to parse configuration file line:\n%s\n"
line;
));
results
(** {1:File tools:} *)
let file_to_string file =
let ic = open_in file in
let len = in_channel_length ic in
let result = String.create len in
let rec readdata start =
let read = input ic result start (len-start) in
if read = 0 then start
else readdata (start+read) in
let real_length = readdata 0 in
close_in ic;
String.sub result 0 real_length
let string_to_file file data =
let oc = open_out file in
output_string oc data;
close_out oc
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [Caml-list] Web page scraping packages
2006-08-01 0:06 Web page scraping packages Joel Reymont
2006-08-01 0:42 ` [Caml-list] " Karl Zilles
@ 2006-08-01 9:10 ` Richard Jones
1 sibling, 0 replies; 3+ messages in thread
From: Richard Jones @ 2006-08-01 9:10 UTC (permalink / raw)
To: Joel Reymont; +Cc: caml-list
On Tue, Aug 01, 2006 at 01:06:52AM +0100, Joel Reymont wrote:
> Are there any screen-scraping packages for OCaml?
>
> I'm looking for something that would let me analyze the contents of a
> web page and extract, for example, all the image tags.
We did some web scraping using WWW::Mechanize + perl4caml. As a
result, perl4caml contains pretty complete bindings for the
WWW::Mechanize library.
http://merjis.com/developers/perl4caml
http://resources.merjis.com/developers/perl4caml/Pl_WWW_Mechanize.www_mechanize.html
Rich.
--
Richard Jones, CTO Merjis Ltd.
Merjis - web marketing and technology - http://merjis.com
Team Notepad - intranets and extranets for business - http://team-notepad.com
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2006-08-01 9:11 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-08-01 0:06 Web page scraping packages Joel Reymont
2006-08-01 0:42 ` [Caml-list] " Karl Zilles
2006-08-01 9:10 ` Richard Jones
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox