Author: scoder
Date: Thu Apr 20 18:24:25 2006
New Revision: 26055
Modified:
lxml/branch/resolver-new/src/lxml/etree.pyx
lxml/branch/resolver-new/src/lxml/parser.pxi
lxml/branch/resolver-new/src/lxml/tests/test_etree.py
lxml/branch/resolver-new/src/lxml/xmlparser.pxd
lxml/branch/resolver-new/src/lxml/xslt.pxi
Log:
initial working implementation of parser entity resolvers
Modified: lxml/branch/resolver-new/src/lxml/etree.pyx
==============================================================================
--- lxml/branch/resolver-new/src/lxml/etree.pyx (original)
+++ lxml/branch/resolver-new/src/lxml/etree.pyx Thu Apr 20 18:24:25 2006
@@ -1281,6 +1281,19 @@
return ElementTree(doc.getroot())
+# class for temporary storage of Python references
+cdef class _TempStore:
+ cdef object _storage
+ def __init__(self):
+ self._storage = {}
+
+ cdef void add(self, obj):
+ python.PyDict_SetItem(self._storage, id(obj), obj)
+
+ cdef void clear(self):
+ python.PyDict_Clear(self._storage)
+
+
# include submodules
include "xmlerror.pxi" # error and log handling
include "xmlid.pxi" # XMLID and IDDict
Modified: lxml/branch/resolver-new/src/lxml/parser.pxi
==============================================================================
--- lxml/branch/resolver-new/src/lxml/parser.pxi (original)
+++ lxml/branch/resolver-new/src/lxml/parser.pxi Thu Apr 20 18:24:25 2006
@@ -58,15 +58,156 @@
__GLOBAL_PARSER_CONTEXT = _ParserContext()
+############################################################
+## Custom resolver API
+############################################################
+
+cdef class _ResolverRegistry # forward declaration
+
+cdef class _ParserInput:
+ cdef xmlparser.xmlParserInput* _input
+ cdef object _pyref # to keep Python references
+
+cdef class _ResolverContext:
+ cdef xmlparser.xmlParserCtxt* _ctxt
+ cdef _ResolverRegistry _resolvers
+ cdef _TempStore _storage
+
+cdef class Resolver:
+ def resolve(self, system_url, public_id, _ResolverContext context not None):
+ cdef _ParserInput parser_input
+ cdef char* c_url
+ cdef char* c_id
+ if __DEFAULT_ENTITY_LOADER is NULL:
+ return None
+ if system_url is None:
+ c_url = NULL
+ else:
+ url_utf = _utf8(system_url)
+ c_url = _cstr(url_utf)
+ if public_id is None:
+ c_id = NULL
+ else:
+ id_utf = _utf8(public_id)
+ c_id = _cstr(id_utf)
+ parser_input = _ParserInput()
+ parser_input._input = __DEFAULT_ENTITY_LOADER(
+ c_url, c_id, context._ctxt)
+ return parser_input
+
+ def resolve_string(self, string, _ResolverContext context not None):
+ cdef _ParserInput parser_input
+ string_utf = _utf8(string)
+ parser_input = _ParserInput()
+ parser_input._input = xmlparser.xmlNewStringInputStream(
+ context._ctxt, _cstr(string_utf))
+ parser_input._pyref = string_utf
+ return parser_input
+
+ def resolve_filename(self, filename, _ResolverContext context not None):
+ cdef _ParserInput parser_input
+ filename_utf = _utf8(filename)
+ parser_input = _ParserInput()
+ parser_input._input = xmlparser.xmlNewInputFromFile(
+ context._ctxt, _cstr(filename_utf))
+ return parser_input
+
+cdef class _ResolverRegistry:
+ cdef object _resolvers
+ cdef Resolver _default_resolver
+ def __init__(self, Resolver default_resolver=None):
+ try:
+ self._resolvers = set()
+ except NameError:
+ from sets import Set
+ self._resolvers = Set()
+ if default_resolver is None:
+ self._default_resolver = Resolver()
+ else:
+ self._default_resolver = default_resolver
+
+ def add(self, Resolver resolver not None):
+ """Register a resolver.
+
+ For each requested entity, the 'resolve' method of the resolver will
+ be called and the result will be passed to the parser. If this method
+ returns None, the request will be delegated to other resolvers or the
+ default resolver. The resolvers will be tested in an arbitrary order
+ until the first match is found.
+ """
+ self._resolvers.add(resolver)
+
+ def remove(self, resolver):
+ self._resolvers.discard(resolver)
+
+ def resolve(self, system_url, public_id, _ResolverContext context not None):
+ for resolver in self._resolvers:
+ result = resolver.resolve(system_url, public_id, context)
+ if result is not None:
+ return result
+ return self._default_resolver.resolve(system_url, public_id, context)
+
+cdef xmlparser.xmlParserInput* _local_resolver(char* c_url, char* c_pubid,
+ xmlParserCtxt* c_context):
+ cdef _ResolverContext context
+ cdef _ParserInput parser_input
+ if c_context._private is NULL:
+ return __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
+
+ if c_url is NULL:
+ url = None
+ else:
+ url = funicode(c_url)
+ if c_pubid is NULL:
+ pubid = None
+ else:
+ pubid = funicode(c_pubid)
+
+ context = <_ResolverContext>c_context._private
+ try:
+ parser_input = context._resolvers.resolve(url, pubid, context)
+ except Exception, e:
+ print e
+ if parser_input is None:
+ return NULL
+ context._storage.add(parser_input)
+ return parser_input._input
+
+cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
+__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
+
+xmlparser.xmlSetExternalEntityLoader(_local_resolver)
+
+############################################################
+## Parsers
+############################################################
+
cdef class BaseParser:
cdef _ErrorLog _error_log
+ cdef readonly object resolvers
+ cdef _ResolverContext _context
def __init__(self):
self._error_log = _ErrorLog()
+ self.resolvers = _ResolverRegistry()
+ self._context = None
property error_log:
def __get__(self):
return self._error_log.copy()
+ cdef _initContext(self, xmlParserCtxt* c_ctxt):
+ cdef _ResolverContext context
+ __GLOBAL_PARSER_CONTEXT._initParserDict(c_ctxt)
+ context = _ResolverContext()
+ context._ctxt = c_ctxt
+ context._resolvers = self.resolvers
+ context._storage = _TempStore()
+ self._context = context
+ c_ctxt._private = <python.PyObject*>context
+
+ cdef _clearContext(self):
+ self._context = None
+
cdef xmlDoc* _handleResult(self, xmlParserCtxt* ctxt,
xmlDoc* result) except NULL:
if ctxt.wellFormed:
@@ -80,7 +221,6 @@
raise XMLSyntaxError
return result
-
############################################################
## XML parser
############################################################
@@ -154,10 +294,10 @@
if pctxt is NULL:
pctxt = self._createContext()
self._memory_parser_ctxt = pctxt
-
- __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt)
+ self._initContext(pctxt)
result = xmlparser.xmlCtxtReadDoc(
pctxt, _cstr(text_utf), NULL, NULL, self._parse_options)
+ self._clearContext()
return self._handleResult(pctxt, result)
cdef xmlDoc* _parseDocFromFile(self, char* filename) except NULL:
@@ -168,10 +308,10 @@
if pctxt is NULL:
pctxt = self._createContext()
self._file_parser_ctxt = pctxt
-
- __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt)
+ self._initContext(pctxt)
result = xmlparser.xmlCtxtReadFile(
pctxt, filename, NULL, self._parse_options)
+ self._clearContext()
if result is NULL:
if pctxt.lastError.domain == xmlerror.XML_FROM_IO:
self._error_log.disconnect()
@@ -203,6 +343,8 @@
else:
raise TypeError, "Invalid parser"
+def get_default_parser():
+ return __DEFAULT_PARSER
############################################################
## HTML parser
@@ -264,9 +406,10 @@
self._error_log.disconnect()
raise ParserError, "Failed to create parser context"
self._memory_parser_ctxt = pctxt
- __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt)
+ self._initContext(pctxt)
result = htmlparser.htmlCtxtReadDoc(
pctxt, c_text, NULL, NULL, self._parse_options)
+ self._clearContext()
return self._handleResult(pctxt, result)
cdef xmlDoc* _parseDocFromFile(self, char* filename) except NULL:
@@ -284,9 +427,10 @@
raise IOError, "Could not open file %s" % filename
raise ParserError, "Failed to create parser context"
self._file_parser_ctxt = pctxt
- __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt)
+ self._initContext(pctxt)
result = htmlparser.htmlCtxtReadFile(
pctxt, filename, NULL, self._parse_options)
+ self._clearContext()
return self._handleResult(pctxt, result)
cdef HTMLParser __DEFAULT_HTML_PARSER
Modified: lxml/branch/resolver-new/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/branch/resolver-new/src/lxml/tests/test_etree.py (original)
+++ lxml/branch/resolver-new/src/lxml/tests/test_etree.py Thu Apr 20 18:24:25 2006
@@ -49,7 +49,26 @@
f = open(fileInTestDir('test_broken.xml'), 'r')
self.assertRaises(SyntaxError, parse, f)
f.close()
-
+
+ def test_resolve_string_dtd(self):
+ parse = self.etree.parse
+ parser = self.etree.XMLParser(dtd_validation=True)
+ assertEqual = self.assertEqual
+ test_url = u"__nosuch.dtd"
+
+ class MyResolver(self.etree.Resolver):
+ def resolve(self, url, id, context):
+ assertEqual(url, test_url)
+ return self.resolve_string(
+ u'<!ENTITY myentity "%s">' % url, context)
+
+ parser.resolvers.add(MyResolver())
+
+ xml = u'<!DOCTYPE doc SYSTEM "%s"><doc>&myentity;</doc>' % test_url
+ tree = parse(StringIO(xml), parser)
+ root = tree.getroot()
+ self.assertEquals(root.text, test_url)
+
# TypeError in etree, AssertionError in ElementTree;
def test_setitem_assert(self):
Element = self.etree.Element
Modified: lxml/branch/resolver-new/src/lxml/xmlparser.pxd
==============================================================================
--- lxml/branch/resolver-new/src/lxml/xmlparser.pxd (original)
+++ lxml/branch/resolver-new/src/lxml/xmlparser.pxd Thu Apr 20 18:24:25 2006
@@ -1,6 +1,9 @@
from tree cimport xmlDoc, xmlDict
from xmlerror cimport xmlError
+cdef extern from "libxml/tree.h":
+ ctypedef struct xmlParserInput
+
cdef extern from "libxml/parser.h":
cdef xmlDict* xmlDictCreate()
@@ -10,6 +13,7 @@
ctypedef struct xmlParserCtxt:
xmlDoc* myDoc
xmlDict* dict
+ void* _private
int wellFormed
xmlError lastError
@@ -42,3 +46,17 @@
int options)
cdef xmlDoc* xmlCtxtReadFile(xmlParserCtxt* ctxt,
char* filename, char* encoding, int options)
+
+ # entity loaders
+
+ ctypedef xmlParserInput* (*xmlExternalEntityLoader)(char * URL,
+ char * ID,
+ xmlParserCtxt* context)
+ cdef xmlExternalEntityLoader xmlGetExternalEntityLoader()
+ cdef void xmlSetExternalEntityLoader(xmlExternalEntityLoader f)
+
+cdef extern from "libxml/parserInternals.h":
+ cdef xmlParserInput* xmlNewStringInputStream(xmlParserCtxt* ctxt,
+ char* buffer)
+ cdef xmlParserInput* xmlNewInputFromFile(xmlParserCtxt* ctxt,
+ char* filename)
Modified: lxml/branch/resolver-new/src/lxml/xslt.pxi
==============================================================================
--- lxml/branch/resolver-new/src/lxml/xslt.pxi (original)
+++ lxml/branch/resolver-new/src/lxml/xslt.pxi Thu Apr 20 18:24:25 2006
@@ -40,8 +40,8 @@
cdef object _extension_functions
cdef object _utf_refs
# for exception handling and temporary reference keeping:
- cdef object _temp_elements
- cdef object _temp_docs
+ cdef _TempStore _temp_elements
+ cdef _TempStore _temp_docs
cdef object _exc_info
def __init__(self, namespaces, extensions):
@@ -68,8 +68,8 @@
self._registered_namespaces = []
self._registered_extensions = []
self._extension_functions = {}
- self._temp_elements = {}
- self._temp_docs = {}
+ self._temp_elements = _TempStore()
+ self._temp_docs = _TempStore()
cdef object _to_utf(self, s):
"Convert to UTF-8 and keep a reference to the encoded string"
@@ -174,8 +174,8 @@
cdef _release_temp_refs(self):
"Free temporarily referenced objects from this context."
- python.PyDict_Clear(self._temp_elements)
- python.PyDict_Clear(self._temp_docs)
+ self._temp_elements.clear()
+ self._temp_docs.clear()
cdef _hold(self, obj):
"""A way to temporarily hold references to nodes in the evaluator.
@@ -193,9 +193,9 @@
if isinstance(o, _NodeBase):
element = <_NodeBase>o
#print "Holding element:", <int>element._c_node
- python.PyDict_SetItem(self._temp_elements, id(element), element)
+ self._temp_elements.add(element)
#print "Holding document:", <int>element._doc._c_doc
- python.PyDict_SetItem(self._temp_docs, id(element._doc), element._doc)
+ self._temp_docs.add(element._doc)
################################################################################
|