logo       
Google Custom Search
    AddThis Social Bookmark Button
-->

r26055 - in lxml/branch/resolver-new/src/lxml: . tests: msg#00057

Subject: r26055 - in lxml/branch/resolver-new/src/lxml: . tests
Author: scoder
Date: Thu Apr 20 18:24:25 2006
New Revision: 26055

Modified:
   lxml/branch/resolver-new/src/lxml/etree.pyx
   lxml/branch/resolver-new/src/lxml/parser.pxi
   lxml/branch/resolver-new/src/lxml/tests/test_etree.py
   lxml/branch/resolver-new/src/lxml/xmlparser.pxd
   lxml/branch/resolver-new/src/lxml/xslt.pxi
Log:
initial working implementation of parser entity resolvers

Modified: lxml/branch/resolver-new/src/lxml/etree.pyx
==============================================================================
--- lxml/branch/resolver-new/src/lxml/etree.pyx (original)
+++ lxml/branch/resolver-new/src/lxml/etree.pyx Thu Apr 20 18:24:25 2006
@@ -1281,6 +1281,19 @@
     return ElementTree(doc.getroot())
 
 
+# class for temporary storage of Python references
+cdef class _TempStore:
+    cdef object _storage
+    def __init__(self):
+        self._storage = {}
+
+    cdef void add(self, obj):
+        python.PyDict_SetItem(self._storage, id(obj), obj)
+
+    cdef void clear(self):
+        python.PyDict_Clear(self._storage)
+
+
 # include submodules
 include "xmlerror.pxi"  # error and log handling
 include "xmlid.pxi"     # XMLID and IDDict

Modified: lxml/branch/resolver-new/src/lxml/parser.pxi
==============================================================================
--- lxml/branch/resolver-new/src/lxml/parser.pxi        (original)
+++ lxml/branch/resolver-new/src/lxml/parser.pxi        Thu Apr 20 18:24:25 2006
@@ -58,15 +58,156 @@
 __GLOBAL_PARSER_CONTEXT = _ParserContext()
 
 
+############################################################
+## Custom resolver API
+############################################################
+
+cdef class _ResolverRegistry # forward declaration
+
+cdef class _ParserInput:
+    cdef xmlparser.xmlParserInput* _input
+    cdef object _pyref # to keep Python references
+
+cdef class _ResolverContext:
+    cdef xmlparser.xmlParserCtxt* _ctxt
+    cdef _ResolverRegistry _resolvers
+    cdef _TempStore _storage
+
+cdef class Resolver:
+    def resolve(self, system_url, public_id, _ResolverContext context not None):
+        cdef _ParserInput parser_input
+        cdef char* c_url
+        cdef char* c_id
+        if __DEFAULT_ENTITY_LOADER is NULL:
+            return None
+        if system_url is None:
+            c_url = NULL
+        else:
+            url_utf = _utf8(system_url)
+            c_url = _cstr(url_utf)
+        if public_id is None:
+            c_id = NULL
+        else:
+            id_utf = _utf8(public_id)
+            c_id = _cstr(id_utf)
+        parser_input = _ParserInput()
+        parser_input._input = __DEFAULT_ENTITY_LOADER(
+            c_url, c_id, context._ctxt)
+        return parser_input
+
+    def resolve_string(self, string, _ResolverContext context not None):
+        cdef _ParserInput parser_input
+        string_utf = _utf8(string)
+        parser_input = _ParserInput()
+        parser_input._input = xmlparser.xmlNewStringInputStream(
+            context._ctxt, _cstr(string_utf))
+        parser_input._pyref = string_utf
+        return parser_input
+
+    def resolve_filename(self, filename, _ResolverContext context not None):
+        cdef _ParserInput parser_input
+        filename_utf = _utf8(filename)
+        parser_input = _ParserInput()
+        parser_input._input = xmlparser.xmlNewInputFromFile(
+            context._ctxt, _cstr(filename_utf))
+        return parser_input
+
+cdef class _ResolverRegistry:
+    cdef object _resolvers
+    cdef Resolver _default_resolver
+    def __init__(self, Resolver default_resolver=None):
+        try:
+            self._resolvers = set()
+        except NameError:
+            from sets import Set
+            self._resolvers = Set()
+        if default_resolver is None:
+            self._default_resolver = Resolver()
+        else:
+            self._default_resolver = default_resolver
+
+    def add(self, Resolver resolver not None):
+        """Register a resolver.
+
+        For each requested entity, the 'resolve' method of the resolver will
+        be called and the result will be passed to the parser.  If this method
+        returns None, the request will be delegated to other resolvers or the
+        default resolver.  The resolvers will be tested in an arbitrary order
+        until the first match is found.
+        """
+        self._resolvers.add(resolver)
+
+    def remove(self, resolver):
+        self._resolvers.discard(resolver)
+
+    def resolve(self, system_url, public_id, _ResolverContext context not None):
+        for resolver in self._resolvers:
+            result = resolver.resolve(system_url, public_id, context)
+            if result is not None:
+                return result
+        return self._default_resolver.resolve(system_url, public_id, context)
+
+cdef xmlparser.xmlParserInput* _local_resolver(char* c_url, char* c_pubid,
+                                               xmlParserCtxt* c_context):
+    cdef _ResolverContext context
+    cdef _ParserInput     parser_input
+    if c_context._private is NULL:
+        return __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
+
+    if c_url is NULL:
+        url = None
+    else:
+        url = funicode(c_url)
+    if c_pubid is NULL:
+        pubid = None
+    else:
+        pubid = funicode(c_pubid)
+
+    context = <_ResolverContext>c_context._private
+    try:
+        parser_input = context._resolvers.resolve(url, pubid, context)
+    except Exception, e:
+        print e
+    if parser_input is None:
+        return NULL
+    context._storage.add(parser_input)
+    return parser_input._input
+
+cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
+__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
+
+xmlparser.xmlSetExternalEntityLoader(_local_resolver)
+
+############################################################
+## Parsers
+############################################################
+
 cdef class BaseParser:
     cdef _ErrorLog _error_log
+    cdef readonly object resolvers
+    cdef _ResolverContext _context
     def __init__(self):
         self._error_log = _ErrorLog()
+        self.resolvers = _ResolverRegistry()
+        self._context = None
 
     property error_log:
         def __get__(self):
             return self._error_log.copy()
 
+    cdef _initContext(self, xmlParserCtxt* c_ctxt):
+        cdef _ResolverContext context
+        __GLOBAL_PARSER_CONTEXT._initParserDict(c_ctxt)
+        context = _ResolverContext()
+        context._ctxt = c_ctxt
+        context._resolvers = self.resolvers
+        context._storage = _TempStore()
+        self._context = context
+        c_ctxt._private = <python.PyObject*>context
+
+    cdef _clearContext(self):
+        self._context = None
+
     cdef xmlDoc* _handleResult(self, xmlParserCtxt* ctxt,
                                xmlDoc* result) except NULL:
         if ctxt.wellFormed:
@@ -80,7 +221,6 @@
             raise XMLSyntaxError
         return result
 
-
 ############################################################
 ## XML parser
 ############################################################
@@ -154,10 +294,10 @@
         if pctxt is NULL:
             pctxt = self._createContext()
             self._memory_parser_ctxt = pctxt
-
-        __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt)
+        self._initContext(pctxt)
         result = xmlparser.xmlCtxtReadDoc(
             pctxt, _cstr(text_utf), NULL, NULL, self._parse_options)
+        self._clearContext()
         return self._handleResult(pctxt, result)
 
     cdef xmlDoc* _parseDocFromFile(self, char* filename) except NULL:
@@ -168,10 +308,10 @@
         if pctxt is NULL:
             pctxt = self._createContext()
             self._file_parser_ctxt = pctxt
-
-        __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt)
+        self._initContext(pctxt)
         result = xmlparser.xmlCtxtReadFile(
             pctxt, filename, NULL, self._parse_options)
+        self._clearContext()
         if result is NULL:
             if pctxt.lastError.domain == xmlerror.XML_FROM_IO:
                 self._error_log.disconnect()
@@ -203,6 +343,8 @@
     else:
         raise TypeError, "Invalid parser"
 
+def get_default_parser():
+    return __DEFAULT_PARSER
 
 ############################################################
 ## HTML parser
@@ -264,9 +406,10 @@
                 self._error_log.disconnect()
                 raise ParserError, "Failed to create parser context"
             self._memory_parser_ctxt = pctxt
-        __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt)
+        self._initContext(pctxt)
         result = htmlparser.htmlCtxtReadDoc(
             pctxt, c_text, NULL, NULL, self._parse_options)
+        self._clearContext()
         return self._handleResult(pctxt, result)
 
     cdef xmlDoc* _parseDocFromFile(self, char* filename) except NULL:
@@ -284,9 +427,10 @@
                     raise IOError, "Could not open file %s" % filename
                 raise ParserError, "Failed to create parser context"
             self._file_parser_ctxt = pctxt
-        __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt)
+        self._initContext(pctxt)
         result = htmlparser.htmlCtxtReadFile(
             pctxt, filename, NULL, self._parse_options)
+        self._clearContext()
         return self._handleResult(pctxt, result)
 
 cdef HTMLParser __DEFAULT_HTML_PARSER

Modified: lxml/branch/resolver-new/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/branch/resolver-new/src/lxml/tests/test_etree.py       (original)
+++ lxml/branch/resolver-new/src/lxml/tests/test_etree.py       Thu Apr 20 18:24:25 2006
@@ -49,7 +49,26 @@
         f = open(fileInTestDir('test_broken.xml'), 'r')
         self.assertRaises(SyntaxError, parse, f)
         f.close()
-        
+
+    def test_resolve_string_dtd(self):
+        parse = self.etree.parse
+        parser = self.etree.XMLParser(dtd_validation=True)
+        assertEqual = self.assertEqual
+        test_url = u"__nosuch.dtd"
+
+        class MyResolver(self.etree.Resolver):
+            def resolve(self, url, id, context):
+                assertEqual(url, test_url)
+                return self.resolve_string(
+                    u'<!ENTITY myentity "%s">' % url, context)
+
+        parser.resolvers.add(MyResolver())
+
+        xml = u'<!DOCTYPE doc SYSTEM "%s"><doc>&myentity;</doc>' % test_url
+        tree = parse(StringIO(xml), parser)
+        root = tree.getroot()
+        self.assertEquals(root.text, test_url)
+
     # TypeError in etree, AssertionError in ElementTree;
     def test_setitem_assert(self):
         Element = self.etree.Element

Modified: lxml/branch/resolver-new/src/lxml/xmlparser.pxd
==============================================================================
--- lxml/branch/resolver-new/src/lxml/xmlparser.pxd     (original)
+++ lxml/branch/resolver-new/src/lxml/xmlparser.pxd     Thu Apr 20 18:24:25 2006
@@ -1,6 +1,9 @@
 from tree cimport xmlDoc, xmlDict
 from xmlerror cimport xmlError
 
+cdef extern from "libxml/tree.h":
+    ctypedef struct xmlParserInput
+
 cdef extern from "libxml/parser.h":
 
     cdef xmlDict* xmlDictCreate()
@@ -10,6 +13,7 @@
     ctypedef struct xmlParserCtxt:
         xmlDoc* myDoc
         xmlDict* dict
+        void* _private
         int wellFormed
         xmlError lastError
         
@@ -42,3 +46,17 @@
                                 int options)
     cdef xmlDoc* xmlCtxtReadFile(xmlParserCtxt* ctxt,
                                  char* filename, char* encoding, int options)
+
+    # entity loaders
+
+    ctypedef xmlParserInput* (*xmlExternalEntityLoader)(char * URL,
+                                                        char * ID, 
+                                                        xmlParserCtxt* context)
+    cdef xmlExternalEntityLoader xmlGetExternalEntityLoader()
+    cdef void xmlSetExternalEntityLoader(xmlExternalEntityLoader f)
+
+cdef extern from "libxml/parserInternals.h":
+    cdef xmlParserInput* xmlNewStringInputStream(xmlParserCtxt* ctxt, 
+                                                char* buffer)
+    cdef xmlParserInput* xmlNewInputFromFile(xmlParserCtxt* ctxt, 
+                                             char* filename)

Modified: lxml/branch/resolver-new/src/lxml/xslt.pxi
==============================================================================
--- lxml/branch/resolver-new/src/lxml/xslt.pxi  (original)
+++ lxml/branch/resolver-new/src/lxml/xslt.pxi  Thu Apr 20 18:24:25 2006
@@ -40,8 +40,8 @@
     cdef object _extension_functions
     cdef object _utf_refs
     # for exception handling and temporary reference keeping:
-    cdef object _temp_elements
-    cdef object _temp_docs
+    cdef _TempStore _temp_elements
+    cdef _TempStore _temp_docs
     cdef object _exc_info
 
     def __init__(self, namespaces, extensions):
@@ -68,8 +68,8 @@
         self._registered_namespaces = []
         self._registered_extensions = []
         self._extension_functions = {}
-        self._temp_elements = {}
-        self._temp_docs = {}
+        self._temp_elements = _TempStore()
+        self._temp_docs     = _TempStore()
 
     cdef object _to_utf(self, s):
         "Convert to UTF-8 and keep a reference to the encoded string"
@@ -174,8 +174,8 @@
 
     cdef _release_temp_refs(self):
         "Free temporarily referenced objects from this context."
-        python.PyDict_Clear(self._temp_elements)
-        python.PyDict_Clear(self._temp_docs)
+        self._temp_elements.clear()
+        self._temp_docs.clear()
         
     cdef _hold(self, obj):
         """A way to temporarily hold references to nodes in the evaluator.
@@ -193,9 +193,9 @@
             if isinstance(o, _NodeBase):
                 element = <_NodeBase>o
                 #print "Holding element:", <int>element._c_node
-                python.PyDict_SetItem(self._temp_elements, id(element), element)
+                self._temp_elements.add(element)
                 #print "Holding document:", <int>element._doc._c_doc
-                python.PyDict_SetItem(self._temp_docs, id(element._doc), element._doc)
+                self._temp_docs.add(element._doc)
 
 
 
################################################################################


<Prev in Thread] Current Thread [Next in Thread>