Author: ianb
Date: Tue Jul 17 19:36:28 2007
New Revision: 45165
Modified:
lxml/branch/html/doc/lxmlhtml.txt
Log:
Added an example doing microformat parsing
Modified: lxml/branch/html/doc/lxmlhtml.txt
==============================================================================
--- lxml/branch/html/doc/lxmlhtml.txt (original)
+++ lxml/branch/html/doc/lxmlhtml.txt Tue Jul 17 19:36:28 2007
@@ -66,6 +66,10 @@
Returns the text content of the element, including the text
content of its children, with no markup.
+``.cssselect(expr)``:
+ Select elements from this element and its children, using a CSS
+ selector expression. (Note that ``.xpath(expr)`` is also
+ available, as it is on all lxml elements.)
Running HTML doctests
=====================
@@ -393,3 +397,59 @@
``word_break_html(html)`` parses the HTML document and returns a
string.
+
+Examples
+========
+
+Microformat Example
+-------------------
+
+This example parses the `hCard <http://microformats.org/wiki/hcard>`_
+microformat.
+
+First we get the page::
+
+ >>> import urllib
+ >>> from lxml.html import HTML
+ >>> url = 'http://microformats.org/'
+ >>> content = urllib.urlopen(url).read()
+ >>> doc = HTML(content)
+ >>> doc.make_links_absolute(url)
+
+Then we create some objects to put the information in::
+
+ >>> class Card(object):
+ ... def __init__(self, **kw):
+ ... for name, value in kw.items():
+ ... setattr(self, name, value)
+ >>> class Phone(object):
+ ... def __init__(self, phone, types=()):
+ ... self.phone, self.types = phone, types
+
+And some generally handy functions for microformats::
+
+ >>> def get_text(el, class_name):
+ ... els = el.find_class(class_name)
+ ... if els:
+ ... return els[0].text_content()
+ ... else:
+ ... return ''
+ >>> def get_value(el):
+ ... return get_text(el, 'value') or el.text_content()
+ >>> def get_all_texts(el, class_name):
+ ... return [e.text_content() for e in el.find_class(class_name)]
+ >>> def parse_addresses(el):
+ ... # Ideally this would parse street, etc.
+ ... return el.find_class('adr')
+
+Then the parsing::
+
+ >>> for el in doc.find_class('hcard'):
+ ... card = Card()
+ ... card.el = el
+ ... card.fn = get_text(el, 'fn')
+ ... card.tels = []
+ ... for tel_el in el.find_class('tel'):
+ ... card.tels.append(Phone(get_value(tel_el),
+ ... get_all_texts(tel_el, 'type')))
+ ... card.addresses = parse_addresses(el)
|